From 7debaaf365240f17d63a7cf889c6747d0b7d1552 Mon Sep 17 00:00:00 2001 From: Farid Zakaria Date: Sat, 24 Jun 2023 23:17:56 +0000 Subject: [PATCH] introduce tbb library --- Makefile | 1 + third_party/tbb/README.cosmo | 17 + third_party/tbb/address_waiter.cpp | 107 + third_party/tbb/allocator.cpp | 314 ++ third_party/tbb/arena.cpp | 858 +++++ third_party/tbb/arena.h | 511 +++ third_party/tbb/arena_slot.cpp | 219 ++ third_party/tbb/arena_slot.h | 415 ++ third_party/tbb/assert_impl.h | 98 + third_party/tbb/blocked_range.h | 171 + third_party/tbb/blocked_range2d.h | 112 + third_party/tbb/blocked_range3d.h | 131 + third_party/tbb/blocked_rangeNd.h | 148 + third_party/tbb/cache_aligned_allocator.h | 190 + third_party/tbb/cancellation_disseminator.h | 86 + third_party/tbb/co_context.h | 428 +++ third_party/tbb/collaborative_call_once.h | 236 ++ third_party/tbb/combinable.h | 70 + third_party/tbb/concurrent_bounded_queue.cpp | 85 + third_party/tbb/concurrent_hash_map.h | 1665 ++++++++ third_party/tbb/concurrent_lru_cache.h | 375 ++ third_party/tbb/concurrent_map.h | 351 ++ third_party/tbb/concurrent_monitor.h | 489 +++ third_party/tbb/concurrent_monitor_mutex.h | 114 + third_party/tbb/concurrent_priority_queue.h | 491 +++ third_party/tbb/concurrent_queue.h | 701 ++++ third_party/tbb/concurrent_set.h | 268 ++ third_party/tbb/concurrent_unordered_map.h | 415 ++ third_party/tbb/concurrent_unordered_set.h | 334 ++ third_party/tbb/concurrent_vector.h | 1130 ++++++ third_party/tbb/detail/_aggregator.h | 177 + third_party/tbb/detail/_aligned_space.h | 47 + third_party/tbb/detail/_allocator_traits.h | 108 + third_party/tbb/detail/_assert.h | 65 + third_party/tbb/detail/_attach.h | 33 + .../tbb/detail/_concurrent_queue_base.h | 651 ++++ .../tbb/detail/_concurrent_skip_list.h | 1291 +++++++ .../tbb/detail/_concurrent_unordered_base.h | 1515 ++++++++ third_party/tbb/detail/_config.h | 530 +++ third_party/tbb/detail/_containers_helpers.h | 68 + third_party/tbb/detail/_exception.h | 89 + third_party/tbb/detail/_export.h | 47 + .../tbb/detail/_flow_graph_body_impl.h | 386 ++ .../tbb/detail/_flow_graph_cache_impl.h | 435 +++ third_party/tbb/detail/_flow_graph_impl.h | 477 +++ .../tbb/detail/_flow_graph_indexer_impl.h | 352 ++ .../tbb/detail/_flow_graph_item_buffer_impl.h | 280 ++ .../tbb/detail/_flow_graph_join_impl.h | 1709 +++++++++ .../tbb/detail/_flow_graph_node_impl.h | 775 ++++ .../tbb/detail/_flow_graph_node_set_impl.h | 266 ++ .../tbb/detail/_flow_graph_nodes_deduction.h | 278 ++ .../detail/_flow_graph_tagged_buffer_impl.h | 258 ++ .../tbb/detail/_flow_graph_trace_impl.h | 365 ++ .../tbb/detail/_flow_graph_types_impl.h | 408 ++ third_party/tbb/detail/_hash_compare.h | 148 + third_party/tbb/detail/_intrusive_list_node.h | 42 + third_party/tbb/detail/_machine.h | 397 ++ third_party/tbb/detail/_mutex_common.h | 62 + third_party/tbb/detail/_namespace_injection.h | 25 + third_party/tbb/detail/_node_handle.h | 163 + third_party/tbb/detail/_pipeline_filters.h | 456 +++ .../tbb/detail/_pipeline_filters_deduction.h | 47 + third_party/tbb/detail/_range_common.h | 131 + third_party/tbb/detail/_rtm_mutex.h | 163 + third_party/tbb/detail/_rtm_rw_mutex.h | 216 ++ third_party/tbb/detail/_scoped_lock.h | 175 + third_party/tbb/detail/_segment_table.h | 567 +++ third_party/tbb/detail/_small_object_pool.h | 109 + third_party/tbb/detail/_string_resource.h | 79 + third_party/tbb/detail/_task.h | 233 ++ third_party/tbb/detail/_task_handle.h | 123 + third_party/tbb/detail/_template_helpers.h | 404 ++ 
third_party/tbb/detail/_utils.h | 394 ++ third_party/tbb/detail/_waitable_atomic.h | 105 + third_party/tbb/dynamic_link.cpp | 516 +++ third_party/tbb/dynamic_link.h | 137 + third_party/tbb/enumerable_thread_specific.h | 1135 ++++++ third_party/tbb/environment.h | 82 + third_party/tbb/exception.cpp | 167 + third_party/tbb/flow_graph.h | 3377 +++++++++++++++++ third_party/tbb/flow_graph_abstractions.h | 52 + third_party/tbb/global_control.cpp | 281 ++ third_party/tbb/global_control.h | 201 + third_party/tbb/governor.cpp | 580 +++ third_party/tbb/governor.h | 157 + third_party/tbb/info.h | 126 + third_party/tbb/intrusive_list.h | 234 ++ third_party/tbb/itt_notify.cpp | 70 + third_party/tbb/itt_notify.h | 118 + third_party/tbb/mailbox.h | 247 ++ third_party/tbb/main.cpp | 172 + third_party/tbb/main.h | 100 + third_party/tbb/market.cpp | 140 + third_party/tbb/market.h | 79 + third_party/tbb/memory_pool.h | 273 ++ third_party/tbb/misc.cpp | 176 + third_party/tbb/misc.h | 298 ++ third_party/tbb/misc_ex.cpp | 457 +++ third_party/tbb/mutex.h | 94 + third_party/tbb/null_mutex.h | 81 + third_party/tbb/null_rw_mutex.h | 88 + third_party/tbb/observer_proxy.cpp | 320 ++ third_party/tbb/observer_proxy.h | 153 + third_party/tbb/parallel_for.h | 470 +++ third_party/tbb/parallel_for_each.h | 682 ++++ third_party/tbb/parallel_invoke.h | 228 ++ third_party/tbb/parallel_pipeline.cpp | 472 +++ third_party/tbb/parallel_pipeline.h | 154 + third_party/tbb/parallel_reduce.h | 772 ++++ third_party/tbb/parallel_scan.h | 631 +++ third_party/tbb/parallel_sort.h | 289 ++ third_party/tbb/partitioner.h | 682 ++++ third_party/tbb/permit_manager.h | 61 + third_party/tbb/pm_client.h | 71 + third_party/tbb/private_server.cpp | 437 +++ third_party/tbb/profiling.cpp | 268 ++ third_party/tbb/profiling.h | 259 ++ third_party/tbb/queuing_mutex.h | 193 + third_party/tbb/queuing_rw_mutex.cpp | 618 +++ third_party/tbb/queuing_rw_mutex.h | 208 + third_party/tbb/rml_base.h | 182 + third_party/tbb/rml_tbb.cpp | 113 + third_party/tbb/rml_tbb.h | 95 + third_party/tbb/rml_thread_monitor.h | 277 ++ third_party/tbb/rtm_mutex.cpp | 122 + third_party/tbb/rtm_rw_mutex.cpp | 272 ++ third_party/tbb/rw_mutex.h | 217 ++ third_party/tbb/scalable_allocator.h | 338 ++ third_party/tbb/scheduler_common.h | 599 +++ third_party/tbb/semaphore.cpp | 93 + third_party/tbb/semaphore.h | 331 ++ third_party/tbb/small_object_pool.cpp | 155 + third_party/tbb/small_object_pool_impl.h | 60 + third_party/tbb/spin_mutex.h | 135 + third_party/tbb/spin_rw_mutex.h | 230 ++ third_party/tbb/task.cpp | 228 ++ third_party/tbb/task.h | 38 + third_party/tbb/task_arena.h | 500 +++ third_party/tbb/task_dispatcher.cpp | 245 ++ third_party/tbb/task_dispatcher.h | 469 +++ third_party/tbb/task_group.h | 747 ++++ third_party/tbb/task_group_context.cpp | 359 ++ third_party/tbb/task_scheduler_observer.h | 117 + third_party/tbb/task_stream.h | 287 ++ third_party/tbb/tbb.h | 75 + third_party/tbb/tbb.mk | 43 + third_party/tbb/tbb.rc | 75 + third_party/tbb/tbb_allocator.h | 127 + third_party/tbb/tbbmalloc_proxy.h | 66 + third_party/tbb/thread_control_monitor.h | 117 + third_party/tbb/thread_data.h | 260 ++ third_party/tbb/thread_dispatcher.cpp | 225 ++ third_party/tbb/thread_dispatcher.h | 107 + third_party/tbb/thread_dispatcher_client.h | 65 + third_party/tbb/thread_request_serializer.cpp | 139 + third_party/tbb/thread_request_serializer.h | 83 + third_party/tbb/threading_control.cpp | 392 ++ third_party/tbb/threading_control.h | 153 + third_party/tbb/threading_control_client.h | 59 + 
third_party/tbb/tick_count.h | 100 + third_party/tbb/tls.h | 103 + third_party/tbb/version.cpp | 27 + third_party/tbb/version.h | 115 + third_party/tbb/waiters.h | 202 + third_party/third_party.mk | 1 + 165 files changed, 50328 insertions(+) create mode 100644 third_party/tbb/README.cosmo create mode 100644 third_party/tbb/address_waiter.cpp create mode 100644 third_party/tbb/allocator.cpp create mode 100644 third_party/tbb/arena.cpp create mode 100644 third_party/tbb/arena.h create mode 100644 third_party/tbb/arena_slot.cpp create mode 100644 third_party/tbb/arena_slot.h create mode 100644 third_party/tbb/assert_impl.h create mode 100644 third_party/tbb/blocked_range.h create mode 100644 third_party/tbb/blocked_range2d.h create mode 100644 third_party/tbb/blocked_range3d.h create mode 100644 third_party/tbb/blocked_rangeNd.h create mode 100644 third_party/tbb/cache_aligned_allocator.h create mode 100644 third_party/tbb/cancellation_disseminator.h create mode 100644 third_party/tbb/co_context.h create mode 100644 third_party/tbb/collaborative_call_once.h create mode 100644 third_party/tbb/combinable.h create mode 100644 third_party/tbb/concurrent_bounded_queue.cpp create mode 100644 third_party/tbb/concurrent_hash_map.h create mode 100644 third_party/tbb/concurrent_lru_cache.h create mode 100644 third_party/tbb/concurrent_map.h create mode 100644 third_party/tbb/concurrent_monitor.h create mode 100644 third_party/tbb/concurrent_monitor_mutex.h create mode 100644 third_party/tbb/concurrent_priority_queue.h create mode 100644 third_party/tbb/concurrent_queue.h create mode 100644 third_party/tbb/concurrent_set.h create mode 100644 third_party/tbb/concurrent_unordered_map.h create mode 100644 third_party/tbb/concurrent_unordered_set.h create mode 100644 third_party/tbb/concurrent_vector.h create mode 100644 third_party/tbb/detail/_aggregator.h create mode 100644 third_party/tbb/detail/_aligned_space.h create mode 100644 third_party/tbb/detail/_allocator_traits.h create mode 100644 third_party/tbb/detail/_assert.h create mode 100644 third_party/tbb/detail/_attach.h create mode 100644 third_party/tbb/detail/_concurrent_queue_base.h create mode 100644 third_party/tbb/detail/_concurrent_skip_list.h create mode 100644 third_party/tbb/detail/_concurrent_unordered_base.h create mode 100644 third_party/tbb/detail/_config.h create mode 100644 third_party/tbb/detail/_containers_helpers.h create mode 100644 third_party/tbb/detail/_exception.h create mode 100644 third_party/tbb/detail/_export.h create mode 100644 third_party/tbb/detail/_flow_graph_body_impl.h create mode 100644 third_party/tbb/detail/_flow_graph_cache_impl.h create mode 100644 third_party/tbb/detail/_flow_graph_impl.h create mode 100644 third_party/tbb/detail/_flow_graph_indexer_impl.h create mode 100644 third_party/tbb/detail/_flow_graph_item_buffer_impl.h create mode 100644 third_party/tbb/detail/_flow_graph_join_impl.h create mode 100644 third_party/tbb/detail/_flow_graph_node_impl.h create mode 100644 third_party/tbb/detail/_flow_graph_node_set_impl.h create mode 100644 third_party/tbb/detail/_flow_graph_nodes_deduction.h create mode 100644 third_party/tbb/detail/_flow_graph_tagged_buffer_impl.h create mode 100644 third_party/tbb/detail/_flow_graph_trace_impl.h create mode 100644 third_party/tbb/detail/_flow_graph_types_impl.h create mode 100644 third_party/tbb/detail/_hash_compare.h create mode 100644 third_party/tbb/detail/_intrusive_list_node.h create mode 100644 third_party/tbb/detail/_machine.h create mode 100644 
third_party/tbb/detail/_mutex_common.h create mode 100644 third_party/tbb/detail/_namespace_injection.h create mode 100644 third_party/tbb/detail/_node_handle.h create mode 100644 third_party/tbb/detail/_pipeline_filters.h create mode 100644 third_party/tbb/detail/_pipeline_filters_deduction.h create mode 100644 third_party/tbb/detail/_range_common.h create mode 100644 third_party/tbb/detail/_rtm_mutex.h create mode 100644 third_party/tbb/detail/_rtm_rw_mutex.h create mode 100644 third_party/tbb/detail/_scoped_lock.h create mode 100644 third_party/tbb/detail/_segment_table.h create mode 100644 third_party/tbb/detail/_small_object_pool.h create mode 100644 third_party/tbb/detail/_string_resource.h create mode 100644 third_party/tbb/detail/_task.h create mode 100644 third_party/tbb/detail/_task_handle.h create mode 100644 third_party/tbb/detail/_template_helpers.h create mode 100644 third_party/tbb/detail/_utils.h create mode 100644 third_party/tbb/detail/_waitable_atomic.h create mode 100644 third_party/tbb/dynamic_link.cpp create mode 100644 third_party/tbb/dynamic_link.h create mode 100644 third_party/tbb/enumerable_thread_specific.h create mode 100644 third_party/tbb/environment.h create mode 100644 third_party/tbb/exception.cpp create mode 100644 third_party/tbb/flow_graph.h create mode 100644 third_party/tbb/flow_graph_abstractions.h create mode 100644 third_party/tbb/global_control.cpp create mode 100644 third_party/tbb/global_control.h create mode 100644 third_party/tbb/governor.cpp create mode 100644 third_party/tbb/governor.h create mode 100644 third_party/tbb/info.h create mode 100644 third_party/tbb/intrusive_list.h create mode 100644 third_party/tbb/itt_notify.cpp create mode 100644 third_party/tbb/itt_notify.h create mode 100644 third_party/tbb/mailbox.h create mode 100644 third_party/tbb/main.cpp create mode 100644 third_party/tbb/main.h create mode 100644 third_party/tbb/market.cpp create mode 100644 third_party/tbb/market.h create mode 100644 third_party/tbb/memory_pool.h create mode 100644 third_party/tbb/misc.cpp create mode 100644 third_party/tbb/misc.h create mode 100644 third_party/tbb/misc_ex.cpp create mode 100644 third_party/tbb/mutex.h create mode 100644 third_party/tbb/null_mutex.h create mode 100644 third_party/tbb/null_rw_mutex.h create mode 100644 third_party/tbb/observer_proxy.cpp create mode 100644 third_party/tbb/observer_proxy.h create mode 100644 third_party/tbb/parallel_for.h create mode 100644 third_party/tbb/parallel_for_each.h create mode 100644 third_party/tbb/parallel_invoke.h create mode 100644 third_party/tbb/parallel_pipeline.cpp create mode 100644 third_party/tbb/parallel_pipeline.h create mode 100644 third_party/tbb/parallel_reduce.h create mode 100644 third_party/tbb/parallel_scan.h create mode 100644 third_party/tbb/parallel_sort.h create mode 100644 third_party/tbb/partitioner.h create mode 100644 third_party/tbb/permit_manager.h create mode 100644 third_party/tbb/pm_client.h create mode 100644 third_party/tbb/private_server.cpp create mode 100644 third_party/tbb/profiling.cpp create mode 100644 third_party/tbb/profiling.h create mode 100644 third_party/tbb/queuing_mutex.h create mode 100644 third_party/tbb/queuing_rw_mutex.cpp create mode 100644 third_party/tbb/queuing_rw_mutex.h create mode 100644 third_party/tbb/rml_base.h create mode 100644 third_party/tbb/rml_tbb.cpp create mode 100644 third_party/tbb/rml_tbb.h create mode 100644 third_party/tbb/rml_thread_monitor.h create mode 100644 third_party/tbb/rtm_mutex.cpp create mode 100644 
third_party/tbb/rtm_rw_mutex.cpp create mode 100644 third_party/tbb/rw_mutex.h create mode 100644 third_party/tbb/scalable_allocator.h create mode 100644 third_party/tbb/scheduler_common.h create mode 100644 third_party/tbb/semaphore.cpp create mode 100644 third_party/tbb/semaphore.h create mode 100644 third_party/tbb/small_object_pool.cpp create mode 100644 third_party/tbb/small_object_pool_impl.h create mode 100644 third_party/tbb/spin_mutex.h create mode 100644 third_party/tbb/spin_rw_mutex.h create mode 100644 third_party/tbb/task.cpp create mode 100644 third_party/tbb/task.h create mode 100644 third_party/tbb/task_arena.h create mode 100644 third_party/tbb/task_dispatcher.cpp create mode 100644 third_party/tbb/task_dispatcher.h create mode 100644 third_party/tbb/task_group.h create mode 100644 third_party/tbb/task_group_context.cpp create mode 100644 third_party/tbb/task_scheduler_observer.h create mode 100644 third_party/tbb/task_stream.h create mode 100644 third_party/tbb/tbb.h create mode 100644 third_party/tbb/tbb.mk create mode 100644 third_party/tbb/tbb.rc create mode 100644 third_party/tbb/tbb_allocator.h create mode 100644 third_party/tbb/tbbmalloc_proxy.h create mode 100644 third_party/tbb/thread_control_monitor.h create mode 100644 third_party/tbb/thread_data.h create mode 100644 third_party/tbb/thread_dispatcher.cpp create mode 100644 third_party/tbb/thread_dispatcher.h create mode 100644 third_party/tbb/thread_dispatcher_client.h create mode 100644 third_party/tbb/thread_request_serializer.cpp create mode 100644 third_party/tbb/thread_request_serializer.h create mode 100644 third_party/tbb/threading_control.cpp create mode 100644 third_party/tbb/threading_control.h create mode 100644 third_party/tbb/threading_control_client.h create mode 100644 third_party/tbb/tick_count.h create mode 100644 third_party/tbb/tls.h create mode 100644 third_party/tbb/version.cpp create mode 100644 third_party/tbb/version.h create mode 100644 third_party/tbb/waiters.h diff --git a/Makefile b/Makefile index d5eba45b3..dda487114 100644 --- a/Makefile +++ b/Makefile @@ -182,6 +182,7 @@ include net/finger/finger.mk include third_party/double-conversion/test/test.mk include third_party/lua/lua.mk include third_party/tr/tr.mk +include third_party/tbb/tbb.mk include third_party/sed/sed.mk include third_party/awk/awk.mk include third_party/hiredis/hiredis.mk diff --git a/third_party/tbb/README.cosmo b/third_party/tbb/README.cosmo new file mode 100644 index 000000000..05e9b3008 --- /dev/null +++ b/third_party/tbb/README.cosmo @@ -0,0 +1,17 @@ +// clang-format off +DESCRIPTION + + oneAPI Threading Building Blocks (oneTBB) + + oneTBB is a flexible C++ library that simplifies the work of adding parallelism to complex applications, + even if you are not a threading expert. + +SOURCE + + https://github.com/oneapi-src/oneTBB + + commit e813596ba3a1bee0ffa06fb66b5e30b7ea801319 + Author: Alexandra + Date: Wed Jun 21 18:46:54 2023 +0200 + + Documentation for std::invoke (#1112) diff --git a/third_party/tbb/address_waiter.cpp b/third_party/tbb/address_waiter.cpp new file mode 100644 index 000000000..0508f06f7 --- /dev/null +++ b/third_party/tbb/address_waiter.cpp @@ -0,0 +1,107 @@ +// clang-format off +/* + Copyright (c) 2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/concurrent_monitor.h" +#include "third_party/tbb/detail/_waitable_atomic.h" + +#include "third_party/libcxx/type_traits" + +namespace tbb { +namespace detail { +namespace r1 { + +struct address_context { + address_context() = default; + + address_context(void* address, std::uintptr_t context) : + my_address(address), my_context(context) + {} + + void* my_address{nullptr}; + std::uintptr_t my_context{0}; +}; + +class address_waiter : public concurrent_monitor_base { + using base_type = concurrent_monitor_base; +public: + using base_type::base_type; + /** per-thread descriptor for concurrent_monitor */ + using thread_context = sleep_node; +}; + +// 1024 is a rough estimate based on two assumptions: +// 1) there are no more than 1000 threads in the application; +// 2) the mutexes are optimized for short critical sections less than a couple of microseconds, +// which is less than 1/1000 of a time slice. +// In the worst case, we have single mutex that is locked and its thread is preempted. +// Therefore, the probability of a collision while taking unrelated mutex is about 1/size of a table. +static constexpr std::size_t num_address_waiters = 2 << 10; +static_assert(std::is_standard_layout::value, + "address_waiter must be with standard layout"); +static address_waiter address_waiter_table[num_address_waiters]; + +void clear_address_waiter_table() { + for (std::size_t i = 0; i < num_address_waiters; ++i) { + address_waiter_table[i].destroy(); + } +} + +static address_waiter& get_address_waiter(void* address) { + std::uintptr_t tag = std::uintptr_t(address); + return address_waiter_table[((tag >> 5) ^ tag) % num_address_waiters]; +} + +void wait_on_address(void* address, d1::delegate_base& predicate, std::uintptr_t context) { + address_waiter& waiter = get_address_waiter(address); + waiter.wait(predicate, address_context{address, context}); +} + +void notify_by_address(void* address, std::uintptr_t target_context) { + address_waiter& waiter = get_address_waiter(address); + + auto predicate = [address, target_context] (address_context ctx) { + return ctx.my_address == address && ctx.my_context == target_context; + }; + + waiter.notify_relaxed(predicate); +} + +void notify_by_address_one(void* address) { + address_waiter& waiter = get_address_waiter(address); + + auto predicate = [address] (address_context ctx) { + return ctx.my_address == address; + }; + + waiter.notify_one_relaxed(predicate); +} + +void notify_by_address_all(void* address) { + address_waiter& waiter = get_address_waiter(address); + + auto predicate = [address] (address_context ctx) { + return ctx.my_address == address; + }; + + waiter.notify_relaxed(predicate); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/allocator.cpp b/third_party/tbb/allocator.cpp new file mode 100644 index 000000000..aec21f80e --- /dev/null +++ b/third_party/tbb/allocator.cpp @@ -0,0 +1,314 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache 
License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/version.h" + +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/tbb_allocator.h" // Is this OK? +#include "third_party/tbb/cache_aligned_allocator.h" + +#include "third_party/tbb/dynamic_link.h" +#include "third_party/tbb/misc.h" + +#include "third_party/libcxx/cstdlib" + +#ifdef _WIN32 +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" +#else +#include "libc/runtime/dlfcn.h" +#endif + +#if (!defined(_WIN32) && !defined(_WIN64)) || defined(__CYGWIN__) +#include "libc/calls/calls.h" +#include "libc/calls/termios.h" +#include "libc/fmt/conv.h" +#include "libc/limits.h" +#include "libc/mem/alg.h" +#include "libc/mem/alloca.h" +#include "libc/mem/mem.h" +#include "libc/runtime/runtime.h" +#include "libc/stdio/dprintf.h" +#include "libc/stdio/rand.h" +#include "libc/stdio/temp.h" +#include "libc/str/str.h" +#include "libc/sysv/consts/exit.h" +#include "third_party/getopt/getopt.h" +#include "third_party/musl/crypt.h" +#include "third_party/musl/rand48.h" // posix_memalign, free +// With glibc, uClibc and musl on Linux and bionic on Android it is safe to use memalign(), as the allocated memory +// can be freed with free(). It is also better to use memalign() since posix_memalign() is just a wrapper on top of +// memalign() and it offers nothing but overhead due to inconvenient interface. This is likely the case with other +// standard libraries as well, and more libraries can be added to the preprocessor check below. Unfortunately, we +// can't detect musl, so we simply enable memalign() on Linux and Android in general. +#if defined(linux) || defined(__linux) || defined(__linux__) || defined(__ANDROID__) +#include "libc/mem/mem.h" // memalign +#define __TBB_USE_MEMALIGN +#else +#define __TBB_USE_POSIX_MEMALIGN +#endif +#elif defined(_MSC_VER) || defined(__MINGW32__) +#include "libc/mem/mem.h" // _aligned_malloc, _aligned_free +#define __TBB_USE_MSVC_ALIGNED_MALLOC +#endif + +#if __TBB_WEAK_SYMBOLS_PRESENT + +#pragma weak scalable_malloc +#pragma weak scalable_free +#pragma weak scalable_aligned_malloc +#pragma weak scalable_aligned_free + +extern "C" { + void* scalable_malloc(std::size_t); + void scalable_free(void*); + void* scalable_aligned_malloc(std::size_t, std::size_t); + void scalable_aligned_free(void*); +} + +#endif /* __TBB_WEAK_SYMBOLS_PRESENT */ + +namespace tbb { +namespace detail { +namespace r1 { + +//! 
Initialization routine used for first indirect call via allocate_handler. +static void* initialize_allocate_handler(std::size_t size); + +//! Handler for memory allocation +using allocate_handler_type = void* (*)(std::size_t size); +static std::atomic allocate_handler{ &initialize_allocate_handler }; +allocate_handler_type allocate_handler_unsafe = nullptr; + +//! Handler for memory deallocation +static void (*deallocate_handler)(void* pointer) = nullptr; + +//! Initialization routine used for first indirect call via cache_aligned_allocate_handler. +static void* initialize_cache_aligned_allocate_handler(std::size_t n, std::size_t alignment); + +//! Allocates overaligned memory using standard memory allocator. It is used when scalable_allocator is not available. +static void* std_cache_aligned_allocate(std::size_t n, std::size_t alignment); + +//! Deallocates overaligned memory using standard memory allocator. It is used when scalable_allocator is not available. +static void std_cache_aligned_deallocate(void* p); + +//! Handler for padded memory allocation +using cache_aligned_allocate_handler_type = void* (*)(std::size_t n, std::size_t alignment); +static std::atomic cache_aligned_allocate_handler{ &initialize_cache_aligned_allocate_handler }; +cache_aligned_allocate_handler_type cache_aligned_allocate_handler_unsafe = nullptr; + +//! Handler for padded memory deallocation +static void (*cache_aligned_deallocate_handler)(void* p) = nullptr; + +//! Table describing how to link the handlers. +static const dynamic_link_descriptor MallocLinkTable[] = { + DLD(scalable_malloc, allocate_handler_unsafe), + DLD(scalable_free, deallocate_handler), + DLD(scalable_aligned_malloc, cache_aligned_allocate_handler_unsafe), + DLD(scalable_aligned_free, cache_aligned_deallocate_handler), +}; + + +#if TBB_USE_DEBUG +#define DEBUG_SUFFIX "_debug" +#else +#define DEBUG_SUFFIX +#endif /* TBB_USE_DEBUG */ + +// MALLOCLIB_NAME is the name of the oneTBB memory allocator library. +#if _WIN32||_WIN64 +#define MALLOCLIB_NAME "tbbmalloc" DEBUG_SUFFIX ".dll" +#elif __APPLE__ +#define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".2.dylib" +#elif __FreeBSD__ || __NetBSD__ || __OpenBSD__ || __sun || _AIX || __ANDROID__ +#define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".so" +#elif __unix__ // Note that order of these #elif's is important! +#define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".so.2" +#else +#error Unknown OS +#endif + +//! Initialize the allocation/free handler pointers. +/** Caller is responsible for ensuring this routine is called exactly once. + The routine attempts to dynamically link with the TBB memory allocator. + If that allocator is not found, it links to malloc and free. */ +void initialize_handler_pointers() { + __TBB_ASSERT(allocate_handler == &initialize_allocate_handler, nullptr); + bool success = dynamic_link(MALLOCLIB_NAME, MallocLinkTable, 4); + if(!success) { + // If unsuccessful, set the handlers to the default routines. + // This must be done now, and not before FillDynamicLinks runs, because if other + // threads call the handlers, we want them to go through the DoOneTimeInitializations logic, + // which forces them to wait. 
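+        // Fall back to the C runtime: std::malloc/std::free serve plain requests, and the
+        // std_cache_aligned_* routines defined below serve the over-aligned ones.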
+ allocate_handler_unsafe = &std::malloc; + deallocate_handler = &std::free; + cache_aligned_allocate_handler_unsafe = &std_cache_aligned_allocate; + cache_aligned_deallocate_handler = &std_cache_aligned_deallocate; + } + + allocate_handler.store(allocate_handler_unsafe, std::memory_order_release); + cache_aligned_allocate_handler.store(cache_aligned_allocate_handler_unsafe, std::memory_order_release); + + PrintExtraVersionInfo( "ALLOCATOR", success?"scalable_malloc":"malloc" ); +} + +static std::once_flag initialization_state; +void initialize_cache_aligned_allocator() { + std::call_once(initialization_state, &initialize_handler_pointers); +} + +//! Executed on very first call through allocate_handler +static void* initialize_allocate_handler(std::size_t size) { + initialize_cache_aligned_allocator(); + __TBB_ASSERT(allocate_handler != &initialize_allocate_handler, nullptr); + return (*allocate_handler)(size); +} + +//! Executed on very first call through cache_aligned_allocate_handler +static void* initialize_cache_aligned_allocate_handler(std::size_t bytes, std::size_t alignment) { + initialize_cache_aligned_allocator(); + __TBB_ASSERT(cache_aligned_allocate_handler != &initialize_cache_aligned_allocate_handler, nullptr); + return (*cache_aligned_allocate_handler)(bytes, alignment); +} + +// TODO: use CPUID to find actual line size, though consider backward compatibility +// nfs - no false sharing +static constexpr std::size_t nfs_size = 128; + +std::size_t __TBB_EXPORTED_FUNC cache_line_size() { + return nfs_size; +} + +void* __TBB_EXPORTED_FUNC cache_aligned_allocate(std::size_t size) { + const std::size_t cache_line_size = nfs_size; + __TBB_ASSERT(is_power_of_two(cache_line_size), "must be power of two"); + + // Check for overflow + if (size + cache_line_size < size) { + throw_exception(exception_id::bad_alloc); + } + // scalable_aligned_malloc considers zero size request an error, and returns nullptr + if (size == 0) size = 1; + + void* result = cache_aligned_allocate_handler.load(std::memory_order_acquire)(size, cache_line_size); + if (!result) { + throw_exception(exception_id::bad_alloc); + } + __TBB_ASSERT(is_aligned(result, cache_line_size), "The returned address isn't aligned"); + return result; +} + +void __TBB_EXPORTED_FUNC cache_aligned_deallocate(void* p) { + __TBB_ASSERT(cache_aligned_deallocate_handler, "Initialization has not been yet."); + (*cache_aligned_deallocate_handler)(p); +} + +static void* std_cache_aligned_allocate(std::size_t bytes, std::size_t alignment) { +#if defined(__TBB_USE_MEMALIGN) + return memalign(alignment, bytes); +#elif defined(__TBB_USE_POSIX_MEMALIGN) + void* p = nullptr; + int res = posix_memalign(&p, alignment, bytes); + if (res != 0) + p = nullptr; + return p; +#elif defined(__TBB_USE_MSVC_ALIGNED_MALLOC) + return _aligned_malloc(bytes, alignment); +#else + // TODO: make it common with cache_aligned_resource + std::size_t space = alignment + bytes; + std::uintptr_t base = reinterpret_cast(std::malloc(space)); + if (!base) { + return nullptr; + } + std::uintptr_t result = (base + nfs_size) & ~(nfs_size - 1); + // Round up to the next cache line (align the base address) + __TBB_ASSERT((result - base) >= sizeof(std::uintptr_t), "Cannot store a base pointer to the header"); + __TBB_ASSERT(space - (result - base) >= bytes, "Not enough space for the storage"); + + // Record where block actually starts. 
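+    // The word just below the aligned address keeps the original base so that
+    // std_cache_aligned_deallocate() can recover and free it. Illustrative values:
+    // with nfs_size == 128 and base == 0x1008, result == 0x1080 and the base is
+    // stored at 0x1078.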
+ (reinterpret_cast(result))[-1] = base; + return reinterpret_cast(result); +#endif +} + +static void std_cache_aligned_deallocate(void* p) { +#if defined(__TBB_USE_MEMALIGN) || defined(__TBB_USE_POSIX_MEMALIGN) + free(p); +#elif defined(__TBB_USE_MSVC_ALIGNED_MALLOC) + _aligned_free(p); +#else + if (p) { + __TBB_ASSERT(reinterpret_cast(p) >= 0x4096, "attempt to free block not obtained from cache_aligned_allocator"); + // Recover where block actually starts + std::uintptr_t base = (reinterpret_cast(p))[-1]; + __TBB_ASSERT(((base + nfs_size) & ~(nfs_size - 1)) == reinterpret_cast(p), "Incorrect alignment or not allocated by std_cache_aligned_deallocate?"); + std::free(reinterpret_cast(base)); + } +#endif +} + +void* __TBB_EXPORTED_FUNC allocate_memory(std::size_t size) { + void* result = allocate_handler.load(std::memory_order_acquire)(size); + if (!result) { + throw_exception(exception_id::bad_alloc); + } + return result; +} + +void __TBB_EXPORTED_FUNC deallocate_memory(void* p) { + if (p) { + __TBB_ASSERT(deallocate_handler, "Initialization has not been yet."); + (*deallocate_handler)(p); + } +} + +bool __TBB_EXPORTED_FUNC is_tbbmalloc_used() { + auto handler_snapshot = allocate_handler.load(std::memory_order_acquire); + if (handler_snapshot == &initialize_allocate_handler) { + initialize_cache_aligned_allocator(); + } + handler_snapshot = allocate_handler.load(std::memory_order_relaxed); + __TBB_ASSERT(handler_snapshot != &initialize_allocate_handler && deallocate_handler != nullptr, nullptr); + // Cast to void avoids type mismatch errors on some compilers (e.g. __IBMCPP__) + __TBB_ASSERT((reinterpret_cast(handler_snapshot) == reinterpret_cast(&std::malloc)) == (reinterpret_cast(deallocate_handler) == reinterpret_cast(&std::free)), + "Both shim pointers must refer to routines from the same package (either TBB or CRT)"); + return reinterpret_cast(handler_snapshot) == reinterpret_cast(&std::malloc); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/arena.cpp b/third_party/tbb/arena.cpp new file mode 100644 index 000000000..6c290b898 --- /dev/null +++ b/third_party/tbb/arena.cpp @@ -0,0 +1,858 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "third_party/tbb/task_dispatcher.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/threading_control.h" +#include "third_party/tbb/arena.h" +#include "third_party/tbb/itt_notify.h" +#include "third_party/tbb/semaphore.h" +#include "third_party/tbb/waiters.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/info.h" +#include "third_party/tbb/tbb_allocator.h" + +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/cstring" +#include "third_party/libcxx/functional" + +namespace tbb { +namespace detail { +namespace r1 { + +#if __TBB_ARENA_BINDING +class numa_binding_observer : public tbb::task_scheduler_observer { + binding_handler* my_binding_handler; +public: + numa_binding_observer( d1::task_arena* ta, int num_slots, int numa_id, core_type_id core_type, int max_threads_per_core ) + : task_scheduler_observer(*ta) + , my_binding_handler(construct_binding_handler(num_slots, numa_id, core_type, max_threads_per_core)) + {} + + void on_scheduler_entry( bool ) override { + apply_affinity_mask(my_binding_handler, this_task_arena::current_thread_index()); + } + + void on_scheduler_exit( bool ) override { + restore_affinity_mask(my_binding_handler, this_task_arena::current_thread_index()); + } + + ~numa_binding_observer() override{ + destroy_binding_handler(my_binding_handler); + } +}; + +numa_binding_observer* construct_binding_observer( d1::task_arena* ta, int num_slots, int numa_id, core_type_id core_type, int max_threads_per_core ) { + numa_binding_observer* binding_observer = nullptr; + if ((core_type >= 0 && core_type_count() > 1) || (numa_id >= 0 && numa_node_count() > 1) || max_threads_per_core > 0) { + binding_observer = new(allocate_memory(sizeof(numa_binding_observer))) numa_binding_observer(ta, num_slots, numa_id, core_type, max_threads_per_core); + __TBB_ASSERT(binding_observer, "Failure during NUMA binding observer allocation and construction"); + binding_observer->observe(true); + } + return binding_observer; +} + +void destroy_binding_observer( numa_binding_observer* binding_observer ) { + __TBB_ASSERT(binding_observer, "Trying to deallocate nullptr pointer"); + binding_observer->observe(false); + binding_observer->~numa_binding_observer(); + deallocate_memory(binding_observer); +} +#endif /*!__TBB_ARENA_BINDING*/ + +void arena::on_thread_leaving(unsigned ref_param) { + // + // Implementation of arena destruction synchronization logic contained various + // bugs/flaws at the different stages of its evolution, so below is a detailed + // description of the issues taken into consideration in the framework of the + // current design. + // + // In case of using fire-and-forget tasks (scheduled via task::enqueue()) + // external thread is allowed to leave its arena before all its work is executed, + // and market may temporarily revoke all workers from this arena. Since revoked + // workers never attempt to reset arena state to EMPTY and cancel its request + // to RML for threads, the arena object is destroyed only when both the last + // thread is leaving it and arena's state is EMPTY (that is its external thread + // left and it does not contain any work). + // Thus resetting arena to EMPTY state (as earlier TBB versions did) should not + // be done here (or anywhere else in the external thread to that matter); doing so + // can result either in arena's premature destruction (at least without + // additional costly checks in workers) or in unnecessary arena state changes + // (and ensuing workers migration). 
+    //
+    // A worker that checks for work presence and transitions the arena to the EMPTY
+    // state (in the snapshot-taking procedure arena::out_of_work()) updates
+    // arena::my_pool_state first and only then arena::my_num_workers_requested.
+    // So the check for work absence must be done against the latter field.
+    //
+    // In the time window between decrementing the active threads count and checking
+    // for an outstanding request for workers, a new worker thread may arrive,
+    // finish the remaining work, set the arena state to empty, and leave, decrementing
+    // its refcount and destroying the arena. The current thread would then destroy
+    // the arena a second time. To preclude this, a local copy of the outstanding
+    // request value can be stored before decrementing the active threads count.
+    //
+    // But this technique may cause two other problems. When the stored request is
+    // zero, it is possible that the arena still has threads, and they can generate new
+    // tasks and thus re-establish non-zero requests. Then all the threads can be
+    // revoked (as described above), leaving this thread the last one and causing
+    // it to destroy a non-empty arena.
+    //
+    // The other problem takes place when the stored request is non-zero. Another
+    // thread may complete the work, set the arena state to empty, and leave without
+    // destroying the arena before this thread decrements the refcount. This thread
+    // cannot destroy the arena either. Thus the arena may be "orphaned".
+    //
+    // In both cases we cannot dereference the arena pointer after the refcount is
+    // decremented, as our arena may already be destroyed.
+    //
+    // If this is the external thread, the market is kept alive by the reference
+    // count this thread holds to it. In the case of workers, the market's liveness
+    // is ensured by the RML connection rundown protocol, according to which the
+    // client (i.e. the market) lives until the RML server notifies it about
+    // connection termination, and this notification is fired only after all
+    // workers return to RML.
+    //
+    // Thus, if we decremented the refcount to zero, we ask the market to check the
+    // arena state (including whether it is still alive) under the lock.
+    //
+
+    __TBB_ASSERT(my_references.load(std::memory_order_relaxed) >= ref_param, "broken arena reference counter");
+
+    // When there are no workers, someone must free the arena, since
+    // without workers no one calls out_of_work().
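+    // Only a departing external thread (ref_param == ref_external) takes this path,
+    // and only while mandatory concurrency is not engaged.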
+ if (ref_param == ref_external && !my_mandatory_concurrency.test()) { + out_of_work(); + } + + threading_control* tc = my_threading_control; + auto tc_client_snapshot = tc->prepare_client_destruction(my_tc_client); + // Release our reference to sync with destroy_client + unsigned remaining_ref = my_references.fetch_sub(ref_param, std::memory_order_release) - ref_param; + // do not access `this` it might be destroyed already + if (remaining_ref == 0) { + if (tc->try_destroy_client(tc_client_snapshot)) { + // We are requested to destroy ourself + free_arena(); + } + } +} + +std::size_t arena::occupy_free_slot_in_range( thread_data& tls, std::size_t lower, std::size_t upper ) { + if ( lower >= upper ) return out_of_arena; + // Start search for an empty slot from the one we occupied the last time + std::size_t index = tls.my_arena_index; + if ( index < lower || index >= upper ) index = tls.my_random.get() % (upper - lower) + lower; + __TBB_ASSERT( index >= lower && index < upper, nullptr); + // Find a free slot + for ( std::size_t i = index; i < upper; ++i ) + if (my_slots[i].try_occupy()) return i; + for ( std::size_t i = lower; i < index; ++i ) + if (my_slots[i].try_occupy()) return i; + return out_of_arena; +} + +template +std::size_t arena::occupy_free_slot(thread_data& tls) { + // Firstly, external threads try to occupy reserved slots + std::size_t index = as_worker ? out_of_arena : occupy_free_slot_in_range( tls, 0, my_num_reserved_slots ); + if ( index == out_of_arena ) { + // Secondly, all threads try to occupy all non-reserved slots + index = occupy_free_slot_in_range(tls, my_num_reserved_slots, my_num_slots ); + // Likely this arena is already saturated + if ( index == out_of_arena ) + return out_of_arena; + } + + atomic_update( my_limit, (unsigned)(index + 1), std::less() ); + return index; +} + +std::uintptr_t arena::calculate_stealing_threshold() { + stack_anchor_type anchor; + return r1::calculate_stealing_threshold(reinterpret_cast(&anchor), my_threading_control->worker_stack_size()); +} + +void arena::process(thread_data& tls) { + governor::set_thread_data(tls); // TODO: consider moving to create_one_job. + __TBB_ASSERT( is_alive(my_guard), nullptr); + __TBB_ASSERT( my_num_slots >= 1, nullptr); + + std::size_t index = occupy_free_slot(tls); + if (index == out_of_arena) { + on_thread_leaving(ref_worker); + return; + } + + __TBB_ASSERT( index >= my_num_reserved_slots, "Workers cannot occupy reserved slots" ); + tls.attach_arena(*this, index); + // worker thread enters the dispatch loop to look for a work + tls.my_inbox.set_is_idle(true); + if (tls.my_arena_slot->is_task_pool_published()) { + tls.my_inbox.set_is_idle(false); + } + + task_dispatcher& task_disp = tls.my_arena_slot->default_task_dispatcher(); + tls.enter_task_dispatcher(task_disp, calculate_stealing_threshold()); + __TBB_ASSERT(task_disp.can_steal(), nullptr); + + __TBB_ASSERT( !tls.my_last_observer, "There cannot be notified local observers when entering arena" ); + my_observers.notify_entry_observers(tls.my_last_observer, tls.my_is_worker); + + // Waiting on special object tied to this arena + outermost_worker_waiter waiter(*this); + d1::task* t = tls.my_task_dispatcher->local_wait_for_all(nullptr, waiter); + // For purposes of affinity support, the slot's mailbox is considered idle while no thread is + // attached to it. 
+ tls.my_inbox.set_is_idle(true); + + __TBB_ASSERT_EX(t == nullptr, "Outermost worker must not leave dispatch loop with a task"); + __TBB_ASSERT(governor::is_thread_data_set(&tls), nullptr); + __TBB_ASSERT(tls.my_task_dispatcher == &task_disp, nullptr); + + my_observers.notify_exit_observers(tls.my_last_observer, tls.my_is_worker); + tls.my_last_observer = nullptr; + + tls.leave_task_dispatcher(); + + // Arena slot detach (arena may be used in market::process) + // TODO: Consider moving several calls below into a new method(e.g.detach_arena). + tls.my_arena_slot->release(); + tls.my_arena_slot = nullptr; + tls.my_inbox.detach(); + __TBB_ASSERT(tls.my_inbox.is_idle_state(true), nullptr); + __TBB_ASSERT(is_alive(my_guard), nullptr); + + // In contrast to earlier versions of TBB (before 3.0 U5) now it is possible + // that arena may be temporarily left unpopulated by threads. See comments in + // arena::on_thread_leaving() for more details. + on_thread_leaving(ref_worker); + __TBB_ASSERT(tls.my_arena == this, "my_arena is used as a hint when searching the arena to join"); +} + +arena::arena(threading_control* control, unsigned num_slots, unsigned num_reserved_slots, unsigned priority_level) { + __TBB_ASSERT( !my_guard, "improperly allocated arena?" ); + __TBB_ASSERT( sizeof(my_slots[0]) % cache_line_size()==0, "arena::slot size not multiple of cache line size" ); + __TBB_ASSERT( is_aligned(this, cache_line_size()), "arena misaligned" ); + my_threading_control = control; + my_limit = 1; + // Two slots are mandatory: for the external thread, and for 1 worker (required to support starvation resistant tasks). + my_num_slots = num_arena_slots(num_slots, num_reserved_slots); + my_num_reserved_slots = num_reserved_slots; + my_max_num_workers = num_slots-num_reserved_slots; + my_priority_level = priority_level; + my_references = ref_external; // accounts for the external thread + my_observers.my_arena = this; + my_co_cache.init(4 * num_slots); + __TBB_ASSERT ( my_max_num_workers <= my_num_slots, nullptr); + // Initialize the default context. It should be allocated before task_dispatch construction. + my_default_ctx = new (cache_aligned_allocate(sizeof(d1::task_group_context))) + d1::task_group_context{ d1::task_group_context::isolated, d1::task_group_context::fp_settings }; + // Construct slots. Mark internal synchronization elements for the tools. 
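+    // Each slot's default task_dispatcher is placement-constructed in the storage that
+    // immediately follows the slot array; the arena allocation reserves room for one
+    // dispatcher per slot in the same cache-aligned block.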
+ task_dispatcher* base_td_pointer = reinterpret_cast(my_slots + my_num_slots); + for( unsigned i = 0; i < my_num_slots; ++i ) { + // __TBB_ASSERT( !my_slots[i].my_scheduler && !my_slots[i].task_pool, nullptr); + __TBB_ASSERT( !my_slots[i].task_pool_ptr, nullptr); + __TBB_ASSERT( !my_slots[i].my_task_pool_size, nullptr); + mailbox(i).construct(); + my_slots[i].init_task_streams(i); + my_slots[i].my_default_task_dispatcher = new(base_td_pointer + i) task_dispatcher(this); + my_slots[i].my_is_occupied.store(false, std::memory_order_relaxed); + } + my_fifo_task_stream.initialize(my_num_slots); + my_resume_task_stream.initialize(my_num_slots); +#if __TBB_PREVIEW_CRITICAL_TASKS + my_critical_task_stream.initialize(my_num_slots); +#endif + my_mandatory_requests = 0; +} + +arena& arena::allocate_arena(threading_control* control, unsigned num_slots, unsigned num_reserved_slots, + unsigned priority_level) +{ + __TBB_ASSERT( sizeof(base_type) + sizeof(arena_slot) == sizeof(arena), "All arena data fields must go to arena_base" ); + __TBB_ASSERT( sizeof(base_type) % cache_line_size() == 0, "arena slots area misaligned: wrong padding" ); + __TBB_ASSERT( sizeof(mail_outbox) == max_nfs_size, "Mailbox padding is wrong" ); + std::size_t n = allocation_size(num_arena_slots(num_slots, num_reserved_slots)); + unsigned char* storage = (unsigned char*)cache_aligned_allocate(n); + // Zero all slots to indicate that they are empty + std::memset( storage, 0, n ); + + return *new( storage + num_arena_slots(num_slots, num_reserved_slots) * sizeof(mail_outbox) ) + arena(control, num_slots, num_reserved_slots, priority_level); +} + +void arena::free_arena () { + __TBB_ASSERT( is_alive(my_guard), nullptr); + __TBB_ASSERT( !my_references.load(std::memory_order_relaxed), "There are threads in the dying arena" ); + __TBB_ASSERT( !my_total_num_workers_requested && !my_num_workers_allotted, "Dying arena requests workers" ); + __TBB_ASSERT( is_empty(), "Inconsistent state of a dying arena" ); +#if __TBB_ARENA_BINDING + if (my_numa_binding_observer != nullptr) { + destroy_binding_observer(my_numa_binding_observer); + my_numa_binding_observer = nullptr; + } +#endif /*__TBB_ARENA_BINDING*/ + poison_value( my_guard ); + for ( unsigned i = 0; i < my_num_slots; ++i ) { + // __TBB_ASSERT( !my_slots[i].my_scheduler, "arena slot is not empty" ); + // TODO: understand the assertion and modify + // __TBB_ASSERT( my_slots[i].task_pool == EmptyTaskPool, nullptr); + __TBB_ASSERT( my_slots[i].head == my_slots[i].tail, nullptr); // TODO: replace by is_quiescent_local_task_pool_empty + my_slots[i].free_task_pool(); + mailbox(i).drain(); + my_slots[i].my_default_task_dispatcher->~task_dispatcher(); + } + __TBB_ASSERT(my_fifo_task_stream.empty(), "Not all enqueued tasks were executed"); + __TBB_ASSERT(my_resume_task_stream.empty(), "Not all enqueued tasks were executed"); + // Cleanup coroutines/schedulers cache + my_co_cache.cleanup(); + my_default_ctx->~task_group_context(); + cache_aligned_deallocate(my_default_ctx); +#if __TBB_PREVIEW_CRITICAL_TASKS + __TBB_ASSERT( my_critical_task_stream.empty(), "Not all critical tasks were executed"); +#endif + // Clear enfources synchronization with observe(false) + my_observers.clear(); + + void* storage = &mailbox(my_num_slots-1); + __TBB_ASSERT( my_references.load(std::memory_order_relaxed) == 0, nullptr); + this->~arena(); +#if TBB_USE_ASSERT > 1 + std::memset( storage, 0, allocation_size(my_num_slots) ); +#endif /* TBB_USE_ASSERT */ + cache_aligned_deallocate( storage ); +} + +bool 
arena::has_enqueued_tasks() { + return !my_fifo_task_stream.empty(); +} + +void arena::request_workers(int mandatory_delta, int workers_delta, bool wakeup_threads) { + my_threading_control->adjust_demand(my_tc_client, mandatory_delta, workers_delta); + + if (wakeup_threads) { + // Notify all sleeping threads that work has appeared in the arena. + get_waiting_threads_monitor().notify([&] (market_context context) { + return this == context.my_arena_addr; + }); + } +} + +bool arena::has_tasks() { + // TODO: rework it to return at least a hint about where a task was found; better if the task itself. + std::size_t n = my_limit.load(std::memory_order_acquire); + bool tasks_are_available = false; + for (std::size_t k = 0; k < n && !tasks_are_available; ++k) { + tasks_are_available = !my_slots[k].is_empty(); + } + tasks_are_available = tasks_are_available || has_enqueued_tasks() || !my_resume_task_stream.empty(); +#if __TBB_PREVIEW_CRITICAL_TASKS + tasks_are_available = tasks_are_available || !my_critical_task_stream.empty(); +#endif + return tasks_are_available; +} + +void arena::out_of_work() { + // We should try unset my_pool_state first due to keep arena invariants in consistent state + // Otherwise, we might have my_pool_state = false and my_mandatory_concurrency = true that is broken invariant + bool disable_mandatory = my_mandatory_concurrency.try_clear_if([this] { return !has_enqueued_tasks(); }); + bool release_workers = my_pool_state.try_clear_if([this] { return !has_tasks(); }); + + if (disable_mandatory || release_workers) { + int mandatory_delta = disable_mandatory ? -1 : 0; + int workers_delta = release_workers ? -(int)my_max_num_workers : 0; + + if (disable_mandatory && is_arena_workerless()) { + // We had set workers_delta to 1 when enabled mandatory concurrency, so revert it now + workers_delta = -1; + } + request_workers(mandatory_delta, workers_delta); + } +} + +void arena::set_top_priority(bool is_top_priority) { + my_is_top_priority.store(is_top_priority, std::memory_order_relaxed); +} + +bool arena::is_top_priority() const { + return my_is_top_priority.load(std::memory_order_relaxed); +} + +bool arena::try_join() { + if (num_workers_active() < my_num_workers_allotted.load(std::memory_order_relaxed)) { + my_references += arena::ref_worker; + return true; + } + return false; +} + +void arena::set_allotment(unsigned allotment) { + if (my_num_workers_allotted.load(std::memory_order_relaxed) != allotment) { + my_num_workers_allotted.store(allotment, std::memory_order_relaxed); + } +} + +std::pair arena::update_request(int mandatory_delta, int workers_delta) { + __TBB_ASSERT(-1 <= mandatory_delta && mandatory_delta <= 1, nullptr); + + int min_workers_request = 0; + int max_workers_request = 0; + + // Calculate min request + my_mandatory_requests += mandatory_delta; + min_workers_request = my_mandatory_requests > 0 ? 1 : 0; + + // Calculate max request + my_total_num_workers_requested += workers_delta; + // Clamp worker request into interval [0, my_max_num_workers] + max_workers_request = clamp(my_total_num_workers_requested, 0, + min_workers_request > 0 && is_arena_workerless() ? 
1 : (int)my_max_num_workers); + + return { min_workers_request, max_workers_request }; +} + +thread_control_monitor& arena::get_waiting_threads_monitor() { + return my_threading_control->get_waiting_threads_monitor(); +} + +void arena::enqueue_task(d1::task& t, d1::task_group_context& ctx, thread_data& td) { + task_group_context_impl::bind_to(ctx, &td); + task_accessor::context(t) = &ctx; + task_accessor::isolation(t) = no_isolation; + my_fifo_task_stream.push( &t, random_lane_selector(td.my_random) ); + advertise_new_work(); +} + +arena& arena::create(threading_control* control, unsigned num_slots, unsigned num_reserved_slots, unsigned arena_priority_level) +{ + __TBB_ASSERT(num_slots > 0, NULL); + __TBB_ASSERT(num_reserved_slots <= num_slots, NULL); + // Add public market reference for an external thread/task_arena (that adds an internal reference in exchange). + arena& a = arena::allocate_arena(control, num_slots, num_reserved_slots, arena_priority_level); + a.my_tc_client = control->create_client(a); + // We should not publish arena until all fields are initialized + control->publish_client(a.my_tc_client); + return a; +} + +} // namespace r1 +} // namespace detail +} // namespace tbb + +// Enable task_arena.h +#include "third_party/tbb/task_arena.h" // task_arena_base + +namespace tbb { +namespace detail { +namespace r1 { + +#if TBB_USE_ASSERT +void assert_arena_priority_valid( tbb::task_arena::priority a_priority ) { + bool is_arena_priority_correct = + a_priority == tbb::task_arena::priority::high || + a_priority == tbb::task_arena::priority::normal || + a_priority == tbb::task_arena::priority::low; + __TBB_ASSERT( is_arena_priority_correct, + "Task arena priority should be equal to one of the predefined values." ); +} +#else +void assert_arena_priority_valid( tbb::task_arena::priority ) {} +#endif + +unsigned arena_priority_level( tbb::task_arena::priority a_priority ) { + assert_arena_priority_valid( a_priority ); + return d1::num_priority_levels - unsigned(int(a_priority) / d1::priority_stride); +} + +tbb::task_arena::priority arena_priority( unsigned priority_level ) { + auto priority = tbb::task_arena::priority( + (d1::num_priority_levels - priority_level) * d1::priority_stride + ); + assert_arena_priority_valid( priority ); + return priority; +} + +struct task_arena_impl { + static void initialize(d1::task_arena_base&); + static void terminate(d1::task_arena_base&); + static bool attach(d1::task_arena_base&); + static void execute(d1::task_arena_base&, d1::delegate_base&); + static void wait(d1::task_arena_base&); + static int max_concurrency(const d1::task_arena_base*); + static void enqueue(d1::task&, d1::task_group_context*, d1::task_arena_base*); +}; + +void __TBB_EXPORTED_FUNC initialize(d1::task_arena_base& ta) { + task_arena_impl::initialize(ta); +} +void __TBB_EXPORTED_FUNC terminate(d1::task_arena_base& ta) { + task_arena_impl::terminate(ta); +} +bool __TBB_EXPORTED_FUNC attach(d1::task_arena_base& ta) { + return task_arena_impl::attach(ta); +} +void __TBB_EXPORTED_FUNC execute(d1::task_arena_base& ta, d1::delegate_base& d) { + task_arena_impl::execute(ta, d); +} +void __TBB_EXPORTED_FUNC wait(d1::task_arena_base& ta) { + task_arena_impl::wait(ta); +} + +int __TBB_EXPORTED_FUNC max_concurrency(const d1::task_arena_base* ta) { + return task_arena_impl::max_concurrency(ta); +} + +void __TBB_EXPORTED_FUNC enqueue(d1::task& t, d1::task_arena_base* ta) { + task_arena_impl::enqueue(t, nullptr, ta); +} + +void __TBB_EXPORTED_FUNC enqueue(d1::task& t, d1::task_group_context& 
ctx, d1::task_arena_base* ta) { + task_arena_impl::enqueue(t, &ctx, ta); +} + +void task_arena_impl::initialize(d1::task_arena_base& ta) { + // Enforce global market initialization to properly initialize soft limit + (void)governor::get_thread_data(); + if (ta.my_max_concurrency < 1) { +#if __TBB_ARENA_BINDING + d1::constraints arena_constraints = d1::constraints{} + .set_core_type(ta.core_type()) + .set_max_threads_per_core(ta.max_threads_per_core()) + .set_numa_id(ta.my_numa_id); + ta.my_max_concurrency = (int)default_concurrency(arena_constraints); +#else /*!__TBB_ARENA_BINDING*/ + ta.my_max_concurrency = (int)governor::default_num_threads(); +#endif /*!__TBB_ARENA_BINDING*/ + } + + __TBB_ASSERT(ta.my_arena.load(std::memory_order_relaxed) == nullptr, "Arena already initialized"); + unsigned priority_level = arena_priority_level(ta.my_priority); + threading_control* thr_control = threading_control::register_public_reference(); + arena& a = arena::create(thr_control, unsigned(ta.my_max_concurrency), ta.my_num_reserved_slots, priority_level); + ta.my_arena.store(&a, std::memory_order_release); +#if __TBB_ARENA_BINDING + a.my_numa_binding_observer = construct_binding_observer( + static_cast(&ta), a.my_num_slots, ta.my_numa_id, ta.core_type(), ta.max_threads_per_core()); +#endif /*__TBB_ARENA_BINDING*/ +} + +void task_arena_impl::terminate(d1::task_arena_base& ta) { + arena* a = ta.my_arena.load(std::memory_order_relaxed); + assert_pointer_valid(a); + threading_control::unregister_public_reference(/*blocking_terminate=*/false); + a->on_thread_leaving(arena::ref_external); + ta.my_arena.store(nullptr, std::memory_order_relaxed); +} + +bool task_arena_impl::attach(d1::task_arena_base& ta) { + __TBB_ASSERT(!ta.my_arena.load(std::memory_order_relaxed), nullptr); + thread_data* td = governor::get_thread_data_if_initialized(); + if( td && td->my_arena ) { + arena* a = td->my_arena; + // There is an active arena to attach to. + // It's still used by s, so won't be destroyed right away. + __TBB_ASSERT(a->my_references > 0, nullptr); + a->my_references += arena::ref_external; + ta.my_num_reserved_slots = a->my_num_reserved_slots; + ta.my_priority = arena_priority(a->my_priority_level); + ta.my_max_concurrency = ta.my_num_reserved_slots + a->my_max_num_workers; + __TBB_ASSERT(arena::num_arena_slots(ta.my_max_concurrency, ta.my_num_reserved_slots) == a->my_num_slots, nullptr); + ta.my_arena.store(a, std::memory_order_release); + // increases threading_control's ref count for task_arena + threading_control::register_public_reference(); + return true; + } + return false; +} + +void task_arena_impl::enqueue(d1::task& t, d1::task_group_context* c, d1::task_arena_base* ta) { + thread_data* td = governor::get_thread_data(); // thread data is only needed for FastRandom instance + assert_pointer_valid(td, "thread_data pointer should not be null"); + arena* a = ta ? + ta->my_arena.load(std::memory_order_relaxed) + : td->my_arena + ; + assert_pointer_valid(a, "arena pointer should not be null"); + auto* ctx = c ? c : a->my_default_ctx; + assert_pointer_valid(ctx, "context pointer should not be null"); + // Is there a better place for checking the state of ctx? 
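+    // Note that the assertion below inspects the arena's default context rather than the
+    // context the task is enqueued with, and it is compiled only in assertion-enabled builds.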
+ __TBB_ASSERT(!a->my_default_ctx->is_group_execution_cancelled(), + "The task will not be executed because its task_group_context is cancelled."); + a->enqueue_task(t, *ctx, *td); +} + +class nested_arena_context : no_copy { +public: + nested_arena_context(thread_data& td, arena& nested_arena, std::size_t slot_index) + : m_orig_execute_data_ext(td.my_task_dispatcher->m_execute_data_ext) + { + if (td.my_arena != &nested_arena) { + m_orig_arena = td.my_arena; + m_orig_slot_index = td.my_arena_index; + m_orig_last_observer = td.my_last_observer; + + td.detach_task_dispatcher(); + td.attach_arena(nested_arena, slot_index); + if (td.my_inbox.is_idle_state(true)) + td.my_inbox.set_is_idle(false); + task_dispatcher& task_disp = td.my_arena_slot->default_task_dispatcher(); + td.enter_task_dispatcher(task_disp, m_orig_execute_data_ext.task_disp->m_stealing_threshold); + + // If the calling thread occupies the slots out of external thread reserve we need to notify the + // market that this arena requires one worker less. + if (td.my_arena_index >= td.my_arena->my_num_reserved_slots) { + td.my_arena->request_workers(/* mandatory_delta = */ 0, /* workers_delta = */ -1); + } + + td.my_last_observer = nullptr; + // The task_arena::execute method considers each calling thread as an external thread. + td.my_arena->my_observers.notify_entry_observers(td.my_last_observer, /* worker*/false); + } + + m_task_dispatcher = td.my_task_dispatcher; + m_orig_fifo_tasks_allowed = m_task_dispatcher->allow_fifo_task(true); + m_orig_critical_task_allowed = m_task_dispatcher->m_properties.critical_task_allowed; + m_task_dispatcher->m_properties.critical_task_allowed = true; + + execution_data_ext& ed_ext = td.my_task_dispatcher->m_execute_data_ext; + ed_ext.context = td.my_arena->my_default_ctx; + ed_ext.original_slot = td.my_arena_index; + ed_ext.affinity_slot = d1::no_slot; + ed_ext.task_disp = td.my_task_dispatcher; + ed_ext.isolation = no_isolation; + + __TBB_ASSERT(td.my_arena_slot, nullptr); + __TBB_ASSERT(td.my_arena_slot->is_occupied(), nullptr); + __TBB_ASSERT(td.my_task_dispatcher, nullptr); + } + ~nested_arena_context() { + thread_data& td = *m_task_dispatcher->m_thread_data; + __TBB_ASSERT(governor::is_thread_data_set(&td), nullptr); + m_task_dispatcher->allow_fifo_task(m_orig_fifo_tasks_allowed); + m_task_dispatcher->m_properties.critical_task_allowed = m_orig_critical_task_allowed; + if (m_orig_arena) { + td.my_arena->my_observers.notify_exit_observers(td.my_last_observer, /*worker*/ false); + td.my_last_observer = m_orig_last_observer; + + // Notify the market that this thread releasing a one slot + // that can be used by a worker thread. + if (td.my_arena_index >= td.my_arena->my_num_reserved_slots) { + td.my_arena->request_workers(/* mandatory_delta = */ 0, /* workers_delta = */ 1); + } + + td.leave_task_dispatcher(); + td.my_arena_slot->release(); + td.my_arena->my_exit_monitors.notify_one(); // do not relax! 
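
nested_arena_context above carries a lot of scheduler bookkeeping (task dispatcher, observers, worker requests), but its core shape is an RAII guard: the constructor saves the calling thread's current binding and rebinds it to the target arena, and the destructor undoes everything in reverse order. A stripped-down sketch of just that shape, with hypothetical ThreadState/Arena stand-ins rather than the real thread_data/arena types:

#include <cstddef>

struct Arena;                       // stand-in for r1::arena
struct ThreadState {                // stand-in for r1::thread_data
    Arena*      arena = nullptr;
    std::size_t slot  = 0;
};

class scoped_arena_switch {
    ThreadState& ts_;
    Arena*       saved_arena_;
    std::size_t  saved_slot_;
public:
    scoped_arena_switch(ThreadState& ts, Arena& nested, std::size_t slot)
        : ts_(ts), saved_arena_(ts.arena), saved_slot_(ts.slot) {
        ts_.arena = &nested;        // bind to the nested arena for the scope's duration
        ts_.slot  = slot;
    }
    ~scoped_arena_switch() {
        ts_.arena = saved_arena_;   // restore the original binding, even on exception
        ts_.slot  = saved_slot_;
    }
    scoped_arena_switch(const scoped_arena_switch&) = delete;
    scoped_arena_switch& operator=(const scoped_arena_switch&) = delete;
};
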
+ + td.attach_arena(*m_orig_arena, m_orig_slot_index); + td.attach_task_dispatcher(*m_orig_execute_data_ext.task_disp); + __TBB_ASSERT(td.my_inbox.is_idle_state(false), nullptr); + } + td.my_task_dispatcher->m_execute_data_ext = m_orig_execute_data_ext; + } + +private: + execution_data_ext m_orig_execute_data_ext{}; + arena* m_orig_arena{ nullptr }; + observer_proxy* m_orig_last_observer{ nullptr }; + task_dispatcher* m_task_dispatcher{ nullptr }; + unsigned m_orig_slot_index{}; + bool m_orig_fifo_tasks_allowed{}; + bool m_orig_critical_task_allowed{}; +}; + +class delegated_task : public d1::task { + d1::delegate_base& m_delegate; + concurrent_monitor& m_monitor; + d1::wait_context& m_wait_ctx; + std::atomic m_completed; + d1::task* execute(d1::execution_data& ed) override { + const execution_data_ext& ed_ext = static_cast(ed); + execution_data_ext orig_execute_data_ext = ed_ext.task_disp->m_execute_data_ext; + __TBB_ASSERT(&ed_ext.task_disp->m_execute_data_ext == &ed, + "The execute data shall point to the current task dispatcher execute data"); + __TBB_ASSERT(ed_ext.task_disp->m_execute_data_ext.isolation == no_isolation, nullptr); + + ed_ext.task_disp->m_execute_data_ext.context = ed_ext.task_disp->get_thread_data().my_arena->my_default_ctx; + bool fifo_task_allowed = ed_ext.task_disp->allow_fifo_task(true); + try_call([&] { + m_delegate(); + }).on_completion([&] { + ed_ext.task_disp->m_execute_data_ext = orig_execute_data_ext; + ed_ext.task_disp->allow_fifo_task(fifo_task_allowed); + }); + + finalize(); + return nullptr; + } + d1::task* cancel(d1::execution_data&) override { + finalize(); + return nullptr; + } + void finalize() { + m_wait_ctx.release(); // must precede the wakeup + m_monitor.notify([this] (std::uintptr_t ctx) { + return ctx == std::uintptr_t(&m_delegate); + }); // do not relax, it needs a fence! + m_completed.store(true, std::memory_order_release); + } +public: + delegated_task(d1::delegate_base& d, concurrent_monitor& s, d1::wait_context& wo) + : m_delegate(d), m_monitor(s), m_wait_ctx(wo), m_completed{ false }{} + ~delegated_task() override { + // The destructor can be called earlier than the m_monitor is notified + // because the waiting thread can be released after m_wait_ctx.release_wait. + // To close that race we wait for the m_completed signal. 
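
The destructor comment above describes a lifetime race: the waiting thread can be released by m_wait_ctx.release() and destroy the delegated_task while finalize() is still inside the monitor notification, which is why the destructor spin-waits on m_completed. A reduced model of that fix using only standard atomics (completion_token and its flag names are inventions of the sketch):

#include <atomic>
#include <functional>
#include <thread>

struct completion_token {
    std::atomic<bool> released{false};   // models wait_context::release()
    std::atomic<bool> completed{false};  // models delegated_task::m_completed
};

void finalize(completion_token& t) {
    t.released.store(true, std::memory_order_release);   // wake the waiter first
    // ... the monitor notification would happen here, still touching 't' ...
    t.completed.store(true, std::memory_order_release);   // the last touch of 't'
}

void waiter(completion_token& t) {
    while (!t.released.load(std::memory_order_acquire)) std::this_thread::yield();
    // Without this second wait the waiter could reclaim 't' while finalize() is
    // still between its two stores; this is what ~delegated_task() spin-waits for.
    while (!t.completed.load(std::memory_order_acquire)) std::this_thread::yield();
    // Now it is safe to destroy or reuse the storage behind 't'.
}

int main() {
    completion_token t;
    std::thread producer(finalize, std::ref(t));
    waiter(t);
    producer.join();
    return 0;
}
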
+ spin_wait_until_eq(m_completed, true); + } +}; + +void task_arena_impl::execute(d1::task_arena_base& ta, d1::delegate_base& d) { + arena* a = ta.my_arena.load(std::memory_order_relaxed); + __TBB_ASSERT(a != nullptr, nullptr); + thread_data* td = governor::get_thread_data(); + + bool same_arena = td->my_arena == a; + std::size_t index1 = td->my_arena_index; + if (!same_arena) { + index1 = a->occupy_free_slot(*td); + if (index1 == arena::out_of_arena) { + concurrent_monitor::thread_context waiter((std::uintptr_t)&d); + d1::wait_context wo(1); + d1::task_group_context exec_context(d1::task_group_context::isolated); + task_group_context_impl::copy_fp_settings(exec_context, *a->my_default_ctx); + + delegated_task dt(d, a->my_exit_monitors, wo); + a->enqueue_task( dt, exec_context, *td); + size_t index2 = arena::out_of_arena; + do { + a->my_exit_monitors.prepare_wait(waiter); + if (!wo.continue_execution()) { + a->my_exit_monitors.cancel_wait(waiter); + break; + } + index2 = a->occupy_free_slot(*td); + if (index2 != arena::out_of_arena) { + a->my_exit_monitors.cancel_wait(waiter); + nested_arena_context scope(*td, *a, index2 ); + r1::wait(wo, exec_context); + __TBB_ASSERT(!exec_context.my_exception.load(std::memory_order_relaxed), nullptr); // exception can be thrown above, not deferred + break; + } + a->my_exit_monitors.commit_wait(waiter); + } while (wo.continue_execution()); + if (index2 == arena::out_of_arena) { + // notify a waiting thread even if this thread did not enter arena, + // in case it was woken by a leaving thread but did not need to enter + a->my_exit_monitors.notify_one(); // do not relax! + } + // process possible exception + auto exception = exec_context.my_exception.load(std::memory_order_acquire); + if (exception) { + __TBB_ASSERT(exec_context.is_group_execution_cancelled(), "The task group context with an exception should be canceled."); + exception->throw_self(); + } + __TBB_ASSERT(governor::is_thread_data_set(td), nullptr); + return; + } // if (index1 == arena::out_of_arena) + } // if (!same_arena) + + context_guard_helper context_guard; + context_guard.set_ctx(a->my_default_ctx); + nested_arena_context scope(*td, *a, index1); +#if _WIN64 + try { +#endif + d(); + __TBB_ASSERT(same_arena || governor::is_thread_data_set(td), nullptr); +#if _WIN64 + } catch (...) { + context_guard.restore_default(); + throw; + } +#endif +} + +void task_arena_impl::wait(d1::task_arena_base& ta) { + arena* a = ta.my_arena.load(std::memory_order_relaxed); + __TBB_ASSERT(a != nullptr, nullptr); + thread_data* td = governor::get_thread_data(); + __TBB_ASSERT_EX(td, "Scheduler is not initialized"); + __TBB_ASSERT(td->my_arena != a || td->my_arena_index == 0, "internal_wait is not supported within a worker context" ); + if (a->my_max_num_workers != 0) { + while (a->num_workers_active() || !a->is_empty()) { + yield(); + } + } +} + +int task_arena_impl::max_concurrency(const d1::task_arena_base *ta) { + arena* a = nullptr; + if( ta ) // for special cases of ta->max_concurrency() + a = ta->my_arena.load(std::memory_order_relaxed); + else if( thread_data* td = governor::get_thread_data_if_initialized() ) + a = td->my_arena; // the current arena if any + + if( a ) { // Get parameters from the arena + __TBB_ASSERT( !ta || ta->my_max_concurrency==1, nullptr); + int mandatory_worker = 0; + if (a->is_arena_workerless() && a->my_num_reserved_slots == 1) { + mandatory_worker = a->my_mandatory_concurrency.test() ? 
1 : 0; + } + return a->my_num_reserved_slots + a->my_max_num_workers + mandatory_worker; + } + + if (ta && ta->my_max_concurrency == 1) { + return 1; + } + +#if __TBB_ARENA_BINDING + if (ta) { + d1::constraints arena_constraints = d1::constraints{} + .set_numa_id(ta->my_numa_id) + .set_core_type(ta->core_type()) + .set_max_threads_per_core(ta->max_threads_per_core()); + return (int)default_concurrency(arena_constraints); + } +#endif /*!__TBB_ARENA_BINDING*/ + + __TBB_ASSERT(!ta || ta->my_max_concurrency==d1::task_arena_base::automatic, nullptr); + return int(governor::default_num_threads()); +} + +void isolate_within_arena(d1::delegate_base& d, std::intptr_t isolation) { + // TODO: Decide what to do if the scheduler is not initialized. Is there a use case for it? + thread_data* tls = governor::get_thread_data(); + assert_pointers_valid(tls, tls->my_task_dispatcher); + task_dispatcher* dispatcher = tls->my_task_dispatcher; + isolation_type previous_isolation = dispatcher->m_execute_data_ext.isolation; + try_call([&] { + // We temporarily change the isolation tag of the currently running task. It will be restored in the destructor of the guard. + isolation_type current_isolation = isolation ? isolation : reinterpret_cast(&d); + // Save the current isolation value and set new one + previous_isolation = dispatcher->set_isolation(current_isolation); + // Isolation within this callable + d(); + }).on_completion([&] { + __TBB_ASSERT(governor::get_thread_data()->my_task_dispatcher == dispatcher, nullptr); + dispatcher->set_isolation(previous_isolation); + }); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/arena.h b/third_party/tbb/arena.h new file mode 100644 index 000000000..18c02828e --- /dev/null +++ b/third_party/tbb/arena.h @@ -0,0 +1,511 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_arena_H +#define _TBB_arena_H + +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/cstring" + +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/spin_mutex.h" + +#include "third_party/tbb/scheduler_common.h" +#include "third_party/tbb/intrusive_list.h" +#include "third_party/tbb/task_stream.h" +#include "third_party/tbb/arena_slot.h" +#include "third_party/tbb/rml_tbb.h" +#include "third_party/tbb/mailbox.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/concurrent_monitor.h" +#include "third_party/tbb/observer_proxy.h" +#include "third_party/tbb/thread_control_monitor.h" +#include "third_party/tbb/threading_control_client.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class task_dispatcher; +class task_group_context; +class threading_control; +class allocate_root_with_context_proxy; + +#if __TBB_ARENA_BINDING +class numa_binding_observer; +#endif /*__TBB_ARENA_BINDING*/ + +//! Bounded coroutines cache LIFO ring buffer +class arena_co_cache { + //! 
Ring buffer storage + task_dispatcher** my_co_scheduler_cache; + //! Current cache index + unsigned my_head; + //! Cache capacity for arena + unsigned my_max_index; + //! Accessor lock for modification operations + tbb::spin_mutex my_co_cache_mutex; + + unsigned next_index() { + return ( my_head == my_max_index ) ? 0 : my_head + 1; + } + + unsigned prev_index() { + return ( my_head == 0 ) ? my_max_index : my_head - 1; + } + + bool internal_empty() { + return my_co_scheduler_cache[prev_index()] == nullptr; + } + + void internal_task_dispatcher_cleanup(task_dispatcher* to_cleanup) { + to_cleanup->~task_dispatcher(); + cache_aligned_deallocate(to_cleanup); + } + +public: + void init(unsigned cache_capacity) { + std::size_t alloc_size = cache_capacity * sizeof(task_dispatcher*); + my_co_scheduler_cache = (task_dispatcher**)cache_aligned_allocate(alloc_size); + std::memset( my_co_scheduler_cache, 0, alloc_size ); + my_head = 0; + my_max_index = cache_capacity - 1; + } + + void cleanup() { + while (task_dispatcher* to_cleanup = pop()) { + internal_task_dispatcher_cleanup(to_cleanup); + } + cache_aligned_deallocate(my_co_scheduler_cache); + } + + //! Insert scheduler to the current available place. + //! Replace an old value, if necessary. + void push(task_dispatcher* s) { + task_dispatcher* to_cleanup = nullptr; + { + tbb::spin_mutex::scoped_lock lock(my_co_cache_mutex); + // Check if we are replacing some existing buffer entrance + if (my_co_scheduler_cache[my_head] != nullptr) { + to_cleanup = my_co_scheduler_cache[my_head]; + } + // Store the cached value + my_co_scheduler_cache[my_head] = s; + // Move head index to the next slot + my_head = next_index(); + } + // Cleanup replaced buffer if any + if (to_cleanup) { + internal_task_dispatcher_cleanup(to_cleanup); + } + } + + //! Get a cached scheduler if any + task_dispatcher* pop() { + tbb::spin_mutex::scoped_lock lock(my_co_cache_mutex); + // No cached coroutine + if (internal_empty()) { + return nullptr; + } + // Move head index to the currently available value + my_head = prev_index(); + // Retrieve the value from the buffer + task_dispatcher* to_return = my_co_scheduler_cache[my_head]; + // Clear the previous entrance value + my_co_scheduler_cache[my_head] = nullptr; + return to_return; + } +}; + +struct stack_anchor_type { + stack_anchor_type() = default; + stack_anchor_type(const stack_anchor_type&) = delete; +}; + +class atomic_flag { + static const std::uintptr_t SET = 1; + static const std::uintptr_t UNSET = 0; + std::atomic my_state{UNSET}; +public: + bool test_and_set() { + std::uintptr_t state = my_state.load(std::memory_order_acquire); + switch (state) { + case SET: + return false; + default: /* busy */ + if (my_state.compare_exchange_strong(state, SET)) { + // We interrupted clear transaction + return false; + } + if (state != UNSET) { + // We lost our epoch + return false; + } + // We are too late but still in the same epoch + __TBB_fallthrough; + case UNSET: + return my_state.compare_exchange_strong(state, SET); + } + } + template + bool try_clear_if(Pred&& pred) { + std::uintptr_t busy = std::uintptr_t(&busy); + std::uintptr_t state = my_state.load(std::memory_order_acquire); + if (state == SET && my_state.compare_exchange_strong(state, busy)) { + if (pred()) { + return my_state.compare_exchange_strong(busy, UNSET); + } + // The result of the next operation is discarded, always false should be returned. 
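
arena_co_cache above is a fixed-capacity LIFO ring of task_dispatcher pointers guarded by a spin mutex: push() evicts and destroys whatever entry it overwrites, and pop() hands back the most recently cached one. A reduced model with std::mutex and a std::vector, templated over a hypothetical element type (capacity must be at least 1):

#include <cstddef>
#include <mutex>
#include <vector>

template <typename T>
class lifo_ring_cache {
    std::vector<T*> buf_;
    std::size_t     head_ = 0;   // next slot to write
    std::mutex      mtx_;

    std::size_t next(std::size_t i) const { return (i + 1 == buf_.size()) ? 0 : i + 1; }
    std::size_t prev(std::size_t i) const { return (i == 0) ? buf_.size() - 1 : i - 1; }
public:
    explicit lifo_ring_cache(std::size_t capacity) : buf_(capacity, nullptr) {}
    ~lifo_ring_cache() { for (T* p : buf_) delete p; }

    void push(T* p) {
        T* evicted = nullptr;
        {
            std::lock_guard<std::mutex> lock(mtx_);
            evicted = buf_[head_];   // may be nullptr if the slot was free
            buf_[head_] = p;
            head_ = next(head_);
        }
        delete evicted;              // destroy the replaced entry outside the lock
    }

    T* pop() {
        std::lock_guard<std::mutex> lock(mtx_);
        std::size_t h = prev(head_);
        T* p = buf_[h];
        if (p) { buf_[h] = nullptr; head_ = h; }
        return p;                    // nullptr when the cache is empty
    }
};
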
+ my_state.compare_exchange_strong(busy, SET); + } + return false; + } + void clear() { + my_state.store(UNSET, std::memory_order_release); + } + bool test(std::memory_order order = std::memory_order_acquire) { + return my_state.load(order) != UNSET; + } +}; + +//! The structure of an arena, except the array of slots. +/** Separated in order to simplify padding. + Intrusive list node base class is used by market to form a list of arenas. **/ +// TODO: Analyze arena_base cache lines placement +struct arena_base : padded { + //! The number of workers that have been marked out by the resource manager to service the arena. + std::atomic my_num_workers_allotted; // heavy use in stealing loop + + //! Reference counter for the arena. + /** Worker and external thread references are counted separately: first several bits are for references + from external thread threads or explicit task_arenas (see arena::ref_external_bits below); + the rest counts the number of workers servicing the arena. */ + std::atomic my_references; // heavy use in stealing loop + + //! The maximal number of currently busy slots. + std::atomic my_limit; // heavy use in stealing loop + + //! Task pool for the tasks scheduled via task::enqueue() method. + /** Such scheduling guarantees eventual execution even if + - new tasks are constantly coming (by extracting scheduled tasks in + relaxed FIFO order); + - the enqueuing thread does not call any of wait_for_all methods. **/ + task_stream my_fifo_task_stream; // heavy use in stealing loop + + //! Task pool for the tasks scheduled via tbb::resume() function. + task_stream my_resume_task_stream; // heavy use in stealing loop + +#if __TBB_PREVIEW_CRITICAL_TASKS + //! Task pool for the tasks with critical property set. + /** Critical tasks are scheduled for execution ahead of other sources (including local task pool + and even bypassed tasks) unless the thread already executes a critical task in an outer + dispatch loop **/ + // used on the hot path of the task dispatch loop + task_stream my_critical_task_stream; +#endif + + //! The total number of workers that are requested from the resource manager. + int my_total_num_workers_requested; + + //! The index in the array of per priority lists of arenas this object is in. + /*const*/ unsigned my_priority_level; + + //! The max priority level of arena in permit manager. + std::atomic my_is_top_priority{false}; + + //! Current task pool state and estimate of available tasks amount. + atomic_flag my_pool_state; + + //! The list of local observers attached to this arena. + observer_list my_observers; + +#if __TBB_ARENA_BINDING + //! Pointer to internal observer that allows to bind threads in arena to certain NUMA node. + numa_binding_observer* my_numa_binding_observer; +#endif /*__TBB_ARENA_BINDING*/ + + // Below are rarely modified members + + threading_control* my_threading_control; + + //! Default task group context. + d1::task_group_context* my_default_ctx; + + //! Waiting object for external threads that cannot join the arena. + concurrent_monitor my_exit_monitors; + + //! Coroutines (task_dispathers) cache buffer + arena_co_cache my_co_cache; + + // arena needs an extra worker despite the arena limit + atomic_flag my_mandatory_concurrency; + // the number of local mandatory concurrency requests + int my_mandatory_requests; + + //! The number of slots in the arena. + unsigned my_num_slots; + //! The number of reserved slots (can be occupied only by external threads). + unsigned my_num_reserved_slots; + //! 
The number of workers requested by the external thread owning the arena. + unsigned my_max_num_workers; + + threading_control_client my_tc_client; + +#if TBB_USE_ASSERT + //! Used to trap accesses to the object after its destruction. + std::uintptr_t my_guard; +#endif /* TBB_USE_ASSERT */ +}; // struct arena_base + +class arena: public padded +{ +public: + using base_type = padded; + + //! Types of work advertised by advertise_new_work() + enum new_work_type { + work_spawned, + wakeup, + work_enqueued + }; + + //! Constructor + arena(threading_control* control, unsigned max_num_workers, unsigned num_reserved_slots, unsigned priority_level); + + //! Allocate an instance of arena. + static arena& allocate_arena(threading_control* control, unsigned num_slots, unsigned num_reserved_slots, + unsigned priority_level); + + static arena& create(threading_control* control, unsigned num_slots, unsigned num_reserved_slots, unsigned arena_priority_level); + + static int unsigned num_arena_slots ( unsigned num_slots, unsigned num_reserved_slots ) { + return num_reserved_slots == 0 ? num_slots : max(2u, num_slots); + } + + static int allocation_size( unsigned num_slots ) { + return sizeof(base_type) + num_slots * (sizeof(mail_outbox) + sizeof(arena_slot) + sizeof(task_dispatcher)); + } + + //! Get reference to mailbox corresponding to given slot_id + mail_outbox& mailbox( d1::slot_id slot ) { + __TBB_ASSERT( slot != d1::no_slot, "affinity should be specified" ); + + return reinterpret_cast(this)[-(int)(slot+1)]; // cast to 'int' is redundant but left for readability + } + + //! Completes arena shutdown, destructs and deallocates it. + void free_arena(); + + //! The number of least significant bits for external references + static const unsigned ref_external_bits = 12; // up to 4095 external and 1M workers + + //! Reference increment values for externals and workers + static const unsigned ref_external = 1; + static const unsigned ref_worker = 1 << ref_external_bits; + + //! The number of workers active in the arena. + unsigned num_workers_active() const { + return my_references.load(std::memory_order_acquire) >> ref_external_bits; + } + + //! Check if the recall is requested by the market. + bool is_recall_requested() const { + return num_workers_active() > my_num_workers_allotted.load(std::memory_order_relaxed); + } + + void request_workers(int mandatory_delta, int workers_delta, bool wakeup_threads = false); + + //! If necessary, raise a flag that there is new job in arena. + template void advertise_new_work(); + + //! Attempts to steal a task from a randomly chosen arena slot + d1::task* steal_task(unsigned arena_index, FastRandom& frnd, execution_data_ext& ed, isolation_type isolation); + + //! Get a task from a global starvation resistant queue + template + d1::task* get_stream_task(task_stream& stream, unsigned& hint); + +#if __TBB_PREVIEW_CRITICAL_TASKS + //! Tries to find a critical task in global critical task stream + d1::task* get_critical_task(unsigned& hint, isolation_type isolation); +#endif + + //! Check if there is job anywhere in arena. + void out_of_work(); + + //! enqueue a task into starvation-resistance queue + void enqueue_task(d1::task&, d1::task_group_context&, thread_data&); + + //! Registers the worker with the arena and enters TBB scheduler dispatch loop + void process(thread_data&); + + //! Notification that the thread leaves its arena + + void on_thread_leaving(unsigned ref_param); + + //! Check for the presence of enqueued tasks + bool has_enqueued_tasks(); + + //! 
Check for the presence of any tasks + bool has_tasks(); + + bool is_empty() { return my_pool_state.test() == /* EMPTY */ false; } + + thread_control_monitor& get_waiting_threads_monitor(); + + static const std::size_t out_of_arena = ~size_t(0); + //! Tries to occupy a slot in the arena. On success, returns the slot index; if no slot is available, returns out_of_arena. + template + std::size_t occupy_free_slot(thread_data&); + //! Tries to occupy a slot in the specified range. + std::size_t occupy_free_slot_in_range(thread_data& tls, std::size_t lower, std::size_t upper); + + std::uintptr_t calculate_stealing_threshold(); + + unsigned priority_level() { return my_priority_level; } + + bool has_request() { return my_total_num_workers_requested; } + + unsigned references() const { return my_references.load(std::memory_order_acquire); } + + bool is_arena_workerless() const { return my_max_num_workers == 0; } + + void set_top_priority(bool); + + bool is_top_priority() const; + + bool try_join(); + + void set_allotment(unsigned allotment); + + std::pair update_request(int mandatory_delta, int workers_delta); + + /** Must be the last data field */ + arena_slot my_slots[1]; +}; // class arena + +template +void arena::advertise_new_work() { + bool is_mandatory_needed = false; + bool are_workers_needed = false; + + if (work_type != work_spawned) { + // Local memory fence here and below is required to avoid missed wakeups; see the comment below. + // Starvation resistant tasks require concurrency, so missed wakeups are unacceptable. + atomic_fence_seq_cst(); + } + + if (work_type == work_enqueued && my_num_slots > my_num_reserved_slots) { + is_mandatory_needed = my_mandatory_concurrency.test_and_set(); + } + + // Double-check idiom that, in case of spawning, is deliberately sloppy about memory fences. + // Technically, to avoid missed wakeups, there should be a full memory fence between the point we + // released the task pool (i.e. spawned task) and read the arena's state. However, adding such a + // fence might hurt overall performance more than it helps, because the fence would be executed + // on every task pool release, even when stealing does not occur. Since TBB allows parallelism, + // but never promises parallelism, the missed wakeup is not a correctness problem. + are_workers_needed = my_pool_state.test_and_set(); + + if (is_mandatory_needed || are_workers_needed) { + int mandatory_delta = is_mandatory_needed ? 1 : 0; + int workers_delta = are_workers_needed ? my_max_num_workers : 0; + + if (is_mandatory_needed && is_arena_workerless()) { + // Set workers_delta to 1 to keep arena invariants consistent + workers_delta = 1; + } + + bool wakeup_workers = is_mandatory_needed || are_workers_needed; + request_workers(mandatory_delta, workers_delta, wakeup_workers); + } +} + +inline d1::task* arena::steal_task(unsigned arena_index, FastRandom& frnd, execution_data_ext& ed, isolation_type isolation) { + auto slot_num_limit = my_limit.load(std::memory_order_relaxed); + if (slot_num_limit == 1) { + // No slots to steal from + return nullptr; + } + // Try to steal a task from a random victim. + std::size_t k = frnd.get() % (slot_num_limit - 1); + // The following condition excludes the external thread that might have + // already taken our previous place in the arena from the list . + // of potential victims. But since such a situation can take + // place only in case of significant oversubscription, keeping + // the checks simple seems to be preferable to complicating the code. 
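
The stealing path above draws a random index over my_limit - 1 candidates and then shifts the result past the thief's own slot (the ++k adjustment just below), so the distribution over the other slots stays uniform. A standalone sketch of that adjustment, with std::mt19937 standing in for TBB's FastRandom:

#include <cstddef>
#include <random>

// Returns an index in [0, slot_limit) that is never equal to self.
// Assumes slot_limit >= 2 and self < slot_limit, mirroring the early return
// the real code takes when my_limit == 1.
std::size_t pick_victim(std::mt19937& rng, std::size_t slot_limit, std::size_t self) {
    std::uniform_int_distribution<std::size_t> dist(0, slot_limit - 2);
    std::size_t k = dist(rng);   // uniform over slot_limit - 1 candidates
    if (k >= self) ++k;          // shift past our own slot; stays uniform over the rest
    return k;
}
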
+ if (k >= arena_index) { + ++k; // Adjusts random distribution to exclude self + } + arena_slot* victim = &my_slots[k]; + d1::task **pool = victim->task_pool.load(std::memory_order_relaxed); + d1::task *t = nullptr; + if (pool == EmptyTaskPool || !(t = victim->steal_task(*this, isolation, k))) { + return nullptr; + } + if (task_accessor::is_proxy_task(*t)) { + task_proxy &tp = *(task_proxy*)t; + d1::slot_id slot = tp.slot; + t = tp.extract_task(); + if (!t) { + // Proxy was empty, so it's our responsibility to free it + tp.allocator.delete_object(&tp, ed); + return nullptr; + } + // Note affinity is called for any stolen task (proxy or general) + ed.affinity_slot = slot; + } else { + // Note affinity is called for any stolen task (proxy or general) + ed.affinity_slot = d1::any_slot; + } + // Update task owner thread id to identify stealing + ed.original_slot = k; + return t; +} + +template +inline d1::task* arena::get_stream_task(task_stream& stream, unsigned& hint) { + if (stream.empty()) + return nullptr; + return stream.pop(subsequent_lane_selector(hint)); +} + +#if __TBB_PREVIEW_CRITICAL_TASKS +// Retrieves critical task respecting isolation level, if provided. The rule is: +// 1) If no outer critical task and no isolation => take any critical task +// 2) If working on an outer critical task and no isolation => cannot take any critical task +// 3) If no outer critical task but isolated => respect isolation +// 4) If working on an outer critical task and isolated => respect isolation +// Hint is used to keep some LIFO-ness, start search with the lane that was used during push operation. +inline d1::task* arena::get_critical_task(unsigned& hint, isolation_type isolation) { + if (my_critical_task_stream.empty()) + return nullptr; + + if ( isolation != no_isolation ) { + return my_critical_task_stream.pop_specific( hint, isolation ); + } else { + return my_critical_task_stream.pop(preceding_lane_selector(hint)); + } +} +#endif // __TBB_PREVIEW_CRITICAL_TASKS + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_arena_H */ diff --git a/third_party/tbb/arena_slot.cpp b/third_party/tbb/arena_slot.cpp new file mode 100644 index 000000000..8c10cf071 --- /dev/null +++ b/third_party/tbb/arena_slot.cpp @@ -0,0 +1,219 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "third_party/tbb/arena_slot.h" +#include "third_party/tbb/arena.h" +#include "third_party/tbb/thread_data.h" + +namespace tbb { +namespace detail { +namespace r1 { + +//------------------------------------------------------------------------ +// Arena Slot +//------------------------------------------------------------------------ +d1::task* arena_slot::get_task_impl(size_t T, execution_data_ext& ed, bool& tasks_omitted, isolation_type isolation) { + __TBB_ASSERT(tail.load(std::memory_order_relaxed) <= T || is_local_task_pool_quiescent(), + "Is it safe to get a task at position T?"); + + d1::task* result = task_pool_ptr[T]; + __TBB_ASSERT(!is_poisoned( result ), "The poisoned task is going to be processed"); + + if (!result) { + return nullptr; + } + bool omit = isolation != no_isolation && isolation != task_accessor::isolation(*result); + if (!omit && !task_accessor::is_proxy_task(*result)) { + return result; + } else if (omit) { + tasks_omitted = true; + return nullptr; + } + + task_proxy& tp = static_cast(*result); + d1::slot_id aff_id = tp.slot; + if ( d1::task *t = tp.extract_task() ) { + ed.affinity_slot = aff_id; + return t; + } + // Proxy was empty, so it's our responsibility to free it + tp.allocator.delete_object(&tp, ed); + + if ( tasks_omitted ) { + task_pool_ptr[T] = nullptr; + } + return nullptr; +} + +d1::task* arena_slot::get_task(execution_data_ext& ed, isolation_type isolation) { + __TBB_ASSERT(is_task_pool_published(), nullptr); + // The current task position in the task pool. + std::size_t T0 = tail.load(std::memory_order_relaxed); + // The bounds of available tasks in the task pool. H0 is only used when the head bound is reached. + std::size_t H0 = (std::size_t)-1, T = T0; + d1::task* result = nullptr; + bool task_pool_empty = false; + bool tasks_omitted = false; + do { + __TBB_ASSERT( !result, nullptr ); + // The full fence is required to sync the store of `tail` with the load of `head` (write-read barrier) + T = --tail; + // The acquire load of head is required to guarantee consistency of our task pool + // when a thief rolls back the head. + if ( (std::intptr_t)( head.load(std::memory_order_acquire) ) > (std::intptr_t)T ) { + acquire_task_pool(); + H0 = head.load(std::memory_order_relaxed); + if ( (std::intptr_t)H0 > (std::intptr_t)T ) { + // The thief has not backed off - nothing to grab. + __TBB_ASSERT( H0 == head.load(std::memory_order_relaxed) + && T == tail.load(std::memory_order_relaxed) + && H0 == T + 1, "victim/thief arbitration algorithm failure" ); + reset_task_pool_and_leave(); + // No tasks in the task pool. + task_pool_empty = true; + break; + } else if ( H0 == T ) { + // There is only one task in the task pool. + reset_task_pool_and_leave(); + task_pool_empty = true; + } else { + // Release task pool if there are still some tasks. + // After the release, the tail will be less than T, thus a thief + // will not attempt to get a task at position T. + release_task_pool(); + } + } + result = get_task_impl( T, ed, tasks_omitted, isolation ); + if ( result ) { + poison_pointer( task_pool_ptr[T] ); + break; + } else if ( !tasks_omitted ) { + poison_pointer( task_pool_ptr[T] ); + __TBB_ASSERT( T0 == T+1, nullptr ); + T0 = T; + } + } while ( !result && !task_pool_empty ); + + if ( tasks_omitted ) { + if ( task_pool_empty ) { + // All tasks have been checked. The task pool should be in reset state. + // We just restore the bounds for the available tasks. + // TODO: Does it have sense to move them to the beginning of the task pool? 
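
get_task() above is the owner's side of a head/tail deque: the owner pops from the tail, thieves advance the head, and the pool is only locked when the two indices collide. A much-simplified model that keeps just the bookkeeping and uses a single mutex throughout (the real protocol is optimistic and mostly lock-free; task is an opaque stand-in):

#include <cstddef>
#include <mutex>
#include <vector>

struct task;   // opaque stand-in for d1::task

class simple_task_pool {
    std::vector<task*> pool_;
    std::size_t head_ = 0;   // next index a thief would take
    std::size_t tail_ = 0;   // one past the last task the owner pushed
    std::mutex  mtx_;
public:
    void push(task* t) {                       // owner only
        std::lock_guard<std::mutex> lock(mtx_);
        if (tail_ == pool_.size()) pool_.push_back(t); else pool_[tail_] = t;
        ++tail_;
    }
    task* pop() {                              // owner only: LIFO end
        std::lock_guard<std::mutex> lock(mtx_);
        if (head_ == tail_) { head_ = tail_ = 0; return nullptr; }  // empty: reset bounds
        return pool_[--tail_];
    }
    task* steal() {                            // thieves: FIFO end
        std::lock_guard<std::mutex> lock(mtx_);
        if (head_ == tail_) return nullptr;
        return pool_[head_++];
    }
};
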
+ __TBB_ASSERT( is_quiescent_local_task_pool_reset(), nullptr ); + if ( result ) { + // If we have a task, it should be at H0 position. + __TBB_ASSERT( H0 == T, nullptr ); + ++H0; + } + __TBB_ASSERT( H0 <= T0, nullptr ); + if ( H0 < T0 ) { + // Restore the task pool if there are some tasks. + head.store(H0, std::memory_order_relaxed); + tail.store(T0, std::memory_order_relaxed); + // The release fence is used in publish_task_pool. + publish_task_pool(); + // Synchronize with snapshot as we published some tasks. + ed.task_disp->m_thread_data->my_arena->advertise_new_work(); + } + } else { + // A task has been obtained. We need to make a hole in position T. + __TBB_ASSERT( is_task_pool_published(), nullptr ); + __TBB_ASSERT( result, nullptr ); + task_pool_ptr[T] = nullptr; + tail.store(T0, std::memory_order_release); + // Synchronize with snapshot as we published some tasks. + // TODO: consider some approach not to call wakeup for each time. E.g. check if the tail reached the head. + ed.task_disp->m_thread_data->my_arena->advertise_new_work(); + } + } + + __TBB_ASSERT( (std::intptr_t)tail.load(std::memory_order_relaxed) >= 0, nullptr ); + __TBB_ASSERT( result || tasks_omitted || is_quiescent_local_task_pool_reset(), nullptr ); + return result; +} + +d1::task* arena_slot::steal_task(arena& a, isolation_type isolation, std::size_t slot_index) { + d1::task** victim_pool = lock_task_pool(); + if (!victim_pool) { + return nullptr; + } + d1::task* result = nullptr; + std::size_t H = head.load(std::memory_order_relaxed); // mirror + std::size_t H0 = H; + bool tasks_omitted = false; + do { + // The full fence is required to sync the store of `head` with the load of `tail` (write-read barrier) + H = ++head; + // The acquire load of tail is required to guarantee consistency of victim_pool + // because the owner synchronizes task spawning via tail. + if ((std::intptr_t)H > (std::intptr_t)(tail.load(std::memory_order_acquire))) { + // Stealing attempt failed, deque contents has not been changed by us + head.store( /*dead: H = */ H0, std::memory_order_relaxed ); + __TBB_ASSERT( !result, nullptr ); + goto unlock; + } + result = victim_pool[H-1]; + __TBB_ASSERT( !is_poisoned( result ), nullptr ); + + if (result) { + if (isolation == no_isolation || isolation == task_accessor::isolation(*result)) { + if (!task_accessor::is_proxy_task(*result)) { + break; + } + task_proxy& tp = *static_cast(result); + // If mailed task is likely to be grabbed by its destination thread, skip it. + if (!task_proxy::is_shared(tp.task_and_tag) || !tp.outbox->recipient_is_idle() || a.mailbox(slot_index).recipient_is_idle()) { + break; + } + } + // The task cannot be executed either due to isolation or proxy constraints. + result = nullptr; + tasks_omitted = true; + } else if (!tasks_omitted) { + // Cleanup the task pool from holes until a task is skipped. + __TBB_ASSERT( H0 == H-1, nullptr ); + poison_pointer( victim_pool[H0] ); + H0 = H; + } + } while (!result); + __TBB_ASSERT( result, nullptr ); + + // emit "task was consumed" signal + poison_pointer( victim_pool[H-1] ); + if (tasks_omitted) { + // Some proxies in the task pool have been omitted. Set the stolen task to nullptr. + victim_pool[H-1] = nullptr; + // The release store synchronizes the victim_pool update(the store of nullptr). 
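
The thief's loop above also filters by isolation: under a non-trivial isolation tag only tasks carrying the same tag may be taken, and everything else is recorded as omitted. A minimal sketch of that filter over a flat list of candidates (task_info and the integer encoding of no_isolation are assumptions of the sketch):

#include <cstdint>
#include <vector>

using isolation_tag = std::intptr_t;
constexpr isolation_tag no_isolation = 0;

struct task_info {
    int           id;
    isolation_tag isolation;
};

// Returns the first candidate the thief is allowed to take, or nullptr.
const task_info* first_allowed(const std::vector<task_info>& candidates,
                               isolation_tag thief_isolation) {
    for (const task_info& t : candidates) {
        // Same rule as above: no isolation, or the tags match.
        if (thief_isolation == no_isolation || thief_isolation == t.isolation)
            return &t;
    }
    return nullptr;   // everything was skipped, as in the tasks_omitted path
}
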
+ head.store( /*dead: H = */ H0, std::memory_order_release ); + } +unlock: + unlock_task_pool(victim_pool); + +#if __TBB_PREFETCHING + __TBB_cl_evict(&victim_slot.head); + __TBB_cl_evict(&victim_slot.tail); +#endif + if (tasks_omitted) { + // Synchronize with snapshot as the head and tail can be bumped which can falsely trigger EMPTY state + a.advertise_new_work(); + } + return result; +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/arena_slot.h b/third_party/tbb/arena_slot.h new file mode 100644 index 000000000..3f18342c5 --- /dev/null +++ b/third_party/tbb/arena_slot.h @@ -0,0 +1,415 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_arena_slot_H +#define _TBB_arena_slot_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_task.h" + +#include "third_party/tbb/cache_aligned_allocator.h" + +#include "third_party/tbb/misc.h" +#include "third_party/tbb/mailbox.h" +#include "third_party/tbb/scheduler_common.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace r1 { + +class arena; +class task_group_context; + +//-------------------------------------------------------------------------------------------------------- +// Arena Slot +//-------------------------------------------------------------------------------------------------------- + +static d1::task** const EmptyTaskPool = nullptr; +static d1::task** const LockedTaskPool = reinterpret_cast(~std::intptr_t(0)); + +struct alignas(max_nfs_size) arena_slot_shared_state { + //! Scheduler of the thread attached to the slot + /** Marks the slot as busy, and is used to iterate through the schedulers belonging to this arena **/ + std::atomic my_is_occupied; + + // Synchronization of access to Task pool + /** Also is used to specify if the slot is empty or locked: + 0 - empty + -1 - locked **/ + std::atomic task_pool; + + //! Index of the first ready task in the deque. + /** Modified by thieves, and by the owner during compaction/reallocation **/ + std::atomic head; +}; + +struct alignas(max_nfs_size) arena_slot_private_state { + //! Hint provided for operations with the container of starvation-resistant tasks. + /** Modified by the owner thread (during these operations). **/ + unsigned hint_for_fifo_stream; + +#if __TBB_PREVIEW_CRITICAL_TASKS + //! Similar to 'hint_for_fifo_stream' but for critical tasks. + unsigned hint_for_critical_stream; +#endif + + //! Similar to 'hint_for_fifo_stream' but for the resume tasks. + unsigned hint_for_resume_stream; + + //! Index of the element following the last ready task in the deque. + /** Modified by the owner thread. **/ + std::atomic tail; + + //! Capacity of the primary task pool (number of elements - pointers to task). + std::size_t my_task_pool_size; + + //! 
Task pool of the scheduler that owns this slot + // TODO: previously was task**__TBB_atomic, but seems like not accessed on other thread + d1::task** task_pool_ptr; +}; + +class arena_slot : private arena_slot_shared_state, private arena_slot_private_state { + friend class arena; + friend class outermost_worker_waiter; + friend class task_dispatcher; + friend class thread_data; + friend class nested_arena_context; + + //! The original task dispather associated with this slot + task_dispatcher* my_default_task_dispatcher; + +#if TBB_USE_ASSERT + void fill_with_canary_pattern ( std::size_t first, std::size_t last ) { + for ( std::size_t i = first; i < last; ++i ) + poison_pointer(task_pool_ptr[i]); + } +#else + void fill_with_canary_pattern ( size_t, std::size_t ) {} +#endif /* TBB_USE_ASSERT */ + + static constexpr std::size_t min_task_pool_size = 64; + + void allocate_task_pool( std::size_t n ) { + std::size_t byte_size = ((n * sizeof(d1::task*) + max_nfs_size - 1) / max_nfs_size) * max_nfs_size; + my_task_pool_size = byte_size / sizeof(d1::task*); + task_pool_ptr = (d1::task**)cache_aligned_allocate(byte_size); + // No need to clear the fresh deque since valid items are designated by the head and tail members. + // But fill it with a canary pattern in the high vigilance debug mode. + fill_with_canary_pattern( 0, my_task_pool_size ); + } + +public: + //! Deallocate task pool that was allocated by means of allocate_task_pool. + void free_task_pool( ) { + // TODO: understand the assertion and modify + // __TBB_ASSERT( !task_pool /* TODO: == EmptyTaskPool */, nullptr); + if( task_pool_ptr ) { + __TBB_ASSERT( my_task_pool_size, nullptr); + cache_aligned_deallocate( task_pool_ptr ); + task_pool_ptr = nullptr; + my_task_pool_size = 0; + } + } + + //! Get a task from the local pool. + /** Called only by the pool owner. + Returns the pointer to the task or nullptr if a suitable task is not found. + Resets the pool if it is empty. **/ + d1::task* get_task(execution_data_ext&, isolation_type); + + //! Steal task from slot's ready pool + d1::task* steal_task(arena&, isolation_type, std::size_t); + + //! Some thread is now the owner of this slot + void occupy() { + __TBB_ASSERT(!my_is_occupied.load(std::memory_order_relaxed), nullptr); + my_is_occupied.store(true, std::memory_order_release); + } + + //! Try to occupy the slot + bool try_occupy() { + return !is_occupied() && my_is_occupied.exchange(true) == false; + } + + //! Some thread is now the owner of this slot + void release() { + __TBB_ASSERT(my_is_occupied.load(std::memory_order_relaxed), nullptr); + my_is_occupied.store(false, std::memory_order_release); + } + + //! 
Spawn newly created tasks + void spawn(d1::task& t) { + std::size_t T = prepare_task_pool(1); + __TBB_ASSERT(is_poisoned(task_pool_ptr[T]), nullptr); + task_pool_ptr[T] = &t; + commit_spawned_tasks(T + 1); + if (!is_task_pool_published()) { + publish_task_pool(); + } + } + + bool is_task_pool_published() const { + return task_pool.load(std::memory_order_relaxed) != EmptyTaskPool; + } + + bool is_empty() const { + return task_pool.load(std::memory_order_relaxed) == EmptyTaskPool || + head.load(std::memory_order_relaxed) >= tail.load(std::memory_order_relaxed); + } + + bool is_occupied() const { + return my_is_occupied.load(std::memory_order_relaxed); + } + + task_dispatcher& default_task_dispatcher() { + __TBB_ASSERT(my_default_task_dispatcher != nullptr, nullptr); + return *my_default_task_dispatcher; + } + + void init_task_streams(unsigned h) { + hint_for_fifo_stream = h; +#if __TBB_RESUMABLE_TASKS + hint_for_resume_stream = h; +#endif +#if __TBB_PREVIEW_CRITICAL_TASKS + hint_for_critical_stream = h; +#endif + } + +#if __TBB_PREVIEW_CRITICAL_TASKS + unsigned& critical_hint() { + return hint_for_critical_stream; + } +#endif +private: + //! Get a task from the local pool at specified location T. + /** Returns the pointer to the task or nullptr if the task cannot be executed, + e.g. proxy has been deallocated or isolation constraint is not met. + tasks_omitted tells if some tasks have been omitted. + Called only by the pool owner. The caller should guarantee that the + position T is not available for a thief. **/ + d1::task* get_task_impl(size_t T, execution_data_ext& ed, bool& tasks_omitted, isolation_type isolation); + + //! Makes sure that the task pool can accommodate at least n more elements + /** If necessary relocates existing task pointers or grows the ready task deque. + * Returns (possible updated) tail index (not accounting for n). **/ + std::size_t prepare_task_pool(std::size_t num_tasks) { + std::size_t T = tail.load(std::memory_order_relaxed); // mirror + if ( T + num_tasks <= my_task_pool_size ) { + return T; + } + + std::size_t new_size = num_tasks; + if ( !my_task_pool_size ) { + __TBB_ASSERT( !is_task_pool_published() && is_quiescent_local_task_pool_reset(), nullptr); + __TBB_ASSERT( !task_pool_ptr, nullptr); + if ( num_tasks < min_task_pool_size ) new_size = min_task_pool_size; + allocate_task_pool( new_size ); + return 0; + } + acquire_task_pool(); + std::size_t H = head.load(std::memory_order_relaxed); // mirror + d1::task** new_task_pool = task_pool_ptr; + __TBB_ASSERT( my_task_pool_size >= min_task_pool_size, nullptr); + // Count not skipped tasks. Consider using std::count_if. + for ( std::size_t i = H; i < T; ++i ) + if ( new_task_pool[i] ) ++new_size; + // If the free space at the beginning of the task pool is too short, we + // are likely facing a pathological single-producer-multiple-consumers + // scenario, and thus it's better to expand the task pool + bool allocate = new_size > my_task_pool_size - min_task_pool_size/4; + if ( allocate ) { + // Grow task pool. As this operation is rare, and its cost is asymptotically + // amortizable, we can tolerate new task pool allocation done under the lock. + if ( new_size < 2 * my_task_pool_size ) + new_size = 2 * my_task_pool_size; + allocate_task_pool( new_size ); // updates my_task_pool_size + } + // Filter out skipped tasks. Consider using std::copy_if. 
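
prepare_task_pool() (its compaction loop continues just below) combines two policies: squeeze out the nullptr holes left by omitted tasks, and grow the pool geometrically when compaction alone would leave too little headroom. A single-threaded sketch of that policy over a std::vector, with hypothetical sizes and no locking:

#include <algorithm>
#include <cstddef>
#include <vector>

struct task;   // opaque stand-in for d1::task

// Ensures capacity for 'extra' more tasks; returns the new tail index.
std::size_t prepare_pool(std::vector<task*>& pool, std::size_t& head, std::size_t& tail,
                         std::size_t extra, std::size_t min_size = 64) {
    if (tail + extra <= pool.size()) return tail;          // enough headroom already

    // Count surviving tasks (holes are nullptr) plus the new ones we need room for.
    std::size_t needed = extra;
    for (std::size_t i = head; i < tail; ++i)
        if (pool[i]) ++needed;

    // If compaction alone would leave too little headroom, grow geometrically.
    bool grow = pool.empty() ||
                needed > pool.size() - std::min(pool.size(), min_size / 4);
    std::size_t new_size = grow ? std::max({needed, 2 * pool.size(), min_size})
                                : pool.size();

    std::vector<task*> fresh(new_size, nullptr);
    std::size_t t1 = 0;
    for (std::size_t i = head; i < tail; ++i)              // compact live tasks to the front
        if (pool[i]) fresh[t1++] = pool[i];
    pool.swap(fresh);
    head = 0;
    tail = t1;
    return t1;                                             // caller appends at the new tail
}
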
+ std::size_t T1 = 0; + for ( std::size_t i = H; i < T; ++i ) { + if ( new_task_pool[i] ) { + task_pool_ptr[T1++] = new_task_pool[i]; + } + } + // Deallocate the previous task pool if a new one has been allocated. + if ( allocate ) + cache_aligned_deallocate( new_task_pool ); + else + fill_with_canary_pattern( T1, tail ); + // Publish the new state. + commit_relocated_tasks( T1 ); + // assert_task_pool_valid(); + return T1; + } + + //! Makes newly spawned tasks visible to thieves + void commit_spawned_tasks(std::size_t new_tail) { + __TBB_ASSERT (new_tail <= my_task_pool_size, "task deque end was overwritten"); + // emit "task was released" signal + // Release fence is necessary to make sure that previously stored task pointers + // are visible to thieves. + tail.store(new_tail, std::memory_order_release); + } + + //! Used by workers to enter the task pool + /** Does not lock the task pool in case if arena slot has been successfully grabbed. **/ + void publish_task_pool() { + __TBB_ASSERT ( task_pool == EmptyTaskPool, "someone else grabbed my arena slot?" ); + __TBB_ASSERT ( head.load(std::memory_order_relaxed) < tail.load(std::memory_order_relaxed), + "entering arena without tasks to share" ); + // Release signal on behalf of previously spawned tasks (when this thread was not in arena yet) + task_pool.store(task_pool_ptr, std::memory_order_release ); + } + + //! Locks the local task pool + /** Garbles task_pool for the duration of the lock. Requires correctly set task_pool_ptr. + ATTENTION: This method is mostly the same as generic_scheduler::lock_task_pool(), with + a little different logic of slot state checks (slot is either locked or points + to our task pool). Thus if either of them is changed, consider changing the counterpart as well. **/ + void acquire_task_pool() { + if (!is_task_pool_published()) { + return; // we are not in arena - nothing to lock + } + bool sync_prepare_done = false; + for( atomic_backoff b;;b.pause() ) { +#if TBB_USE_ASSERT + // Local copy of the arena slot task pool pointer is necessary for the next + // assertion to work correctly to exclude asynchronous state transition effect. + d1::task** tp = task_pool.load(std::memory_order_relaxed); + __TBB_ASSERT( tp == LockedTaskPool || tp == task_pool_ptr, "slot ownership corrupt?" ); +#endif + d1::task** expected = task_pool_ptr; + if( task_pool.load(std::memory_order_relaxed) != LockedTaskPool && + task_pool.compare_exchange_strong(expected, LockedTaskPool ) ) { + // We acquired our own slot + break; + } else if( !sync_prepare_done ) { + // Start waiting + sync_prepare_done = true; + } + // Someone else acquired a lock, so pause and do exponential backoff. + } + __TBB_ASSERT( task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "not really acquired task pool" ); + } + + //! Unlocks the local task pool + /** Restores task_pool munged by acquire_task_pool. Requires + correctly set task_pool_ptr. **/ + void release_task_pool() { + if ( !(task_pool.load(std::memory_order_relaxed) != EmptyTaskPool) ) + return; // we are not in arena - nothing to unlock + __TBB_ASSERT( task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "arena slot is not locked" ); + task_pool.store( task_pool_ptr, std::memory_order_release ); + } + + //! Locks victim's task pool, and returns pointer to it. The pointer can be nullptr. + /** Garbles victim_arena_slot->task_pool for the duration of the lock. 
**/ + d1::task** lock_task_pool() { + d1::task** victim_task_pool; + for ( atomic_backoff backoff;; /*backoff pause embedded in the loop*/) { + victim_task_pool = task_pool.load(std::memory_order_relaxed); + // Microbenchmarks demonstrated that aborting stealing attempt when the + // victim's task pool is locked degrade performance. + // NOTE: Do not use comparison of head and tail indices to check for + // the presence of work in the victim's task pool, as they may give + // incorrect indication because of task pool relocations and resizes. + if (victim_task_pool == EmptyTaskPool) { + break; + } + d1::task** expected = victim_task_pool; + if (victim_task_pool != LockedTaskPool && task_pool.compare_exchange_strong(expected, LockedTaskPool) ) { + // We've locked victim's task pool + break; + } + // Someone else acquired a lock, so pause and do exponential backoff. + backoff.pause(); + } + __TBB_ASSERT(victim_task_pool == EmptyTaskPool || + (task_pool.load(std::memory_order_relaxed) == LockedTaskPool && + victim_task_pool != LockedTaskPool), "not really locked victim's task pool?"); + return victim_task_pool; + } + + //! Unlocks victim's task pool + /** Restores victim_arena_slot->task_pool munged by lock_task_pool. **/ + void unlock_task_pool(d1::task** victim_task_pool) { + __TBB_ASSERT(task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "victim arena slot is not locked"); + __TBB_ASSERT(victim_task_pool != LockedTaskPool, nullptr); + task_pool.store(victim_task_pool, std::memory_order_release); + } + +#if TBB_USE_ASSERT + bool is_local_task_pool_quiescent() const { + d1::task** tp = task_pool.load(std::memory_order_relaxed); + return tp == EmptyTaskPool || tp == LockedTaskPool; + } + + bool is_quiescent_local_task_pool_empty() const { + __TBB_ASSERT(is_local_task_pool_quiescent(), "Task pool is not quiescent"); + return head.load(std::memory_order_relaxed) == tail.load(std::memory_order_relaxed); + } + + bool is_quiescent_local_task_pool_reset() const { + __TBB_ASSERT(is_local_task_pool_quiescent(), "Task pool is not quiescent"); + return head.load(std::memory_order_relaxed) == 0 && tail.load(std::memory_order_relaxed) == 0; + } +#endif // TBB_USE_ASSERT + + //! Leave the task pool + /** Leaving task pool automatically releases the task pool if it is locked. **/ + void leave_task_pool() { + __TBB_ASSERT(is_task_pool_published(), "Not in arena"); + // Do not reset my_arena_index. It will be used to (attempt to) re-acquire the slot next time + __TBB_ASSERT(task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "Task pool must be locked when leaving arena"); + __TBB_ASSERT(is_quiescent_local_task_pool_empty(), "Cannot leave arena when the task pool is not empty"); + // No release fence is necessary here as this assignment precludes external + // accesses to the local task pool when becomes visible. Thus it is harmless + // if it gets hoisted above preceding local bookkeeping manipulations. + task_pool.store(EmptyTaskPool, std::memory_order_relaxed); + } + + //! Resets head and tail indices to 0, and leaves task pool + /** The task pool must be locked by the owner (via acquire_task_pool).**/ + void reset_task_pool_and_leave() { + __TBB_ASSERT(task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "Task pool must be locked when resetting task pool"); + tail.store(0, std::memory_order_relaxed); + head.store(0, std::memory_order_relaxed); + leave_task_pool(); + } + + //! Makes relocated tasks visible to thieves and releases the local task pool. 
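
acquire_task_pool() and lock_task_pool() above are a small spin lock in disguise: the published task_pool pointer doubles as the lock word, and LockedTaskPool is a sentinel pointer value swapped in for the duration of the lock. A standalone sketch of the same idea with std::atomic and a crude backoff (the names and the backoff cap are choices of the sketch):

#include <atomic>
#include <cstdint>
#include <thread>

struct task;   // opaque stand-in

static task** const EmptyPool  = nullptr;
static task** const LockedPool = reinterpret_cast<task**>(~std::uintptr_t(0));

// Locks the pool and returns the pointer that was published there (may be EmptyPool,
// in which case there is nothing to lock).
task** lock_pool(std::atomic<task**>& pool) {
    for (int spins = 1;;) {
        task** observed = pool.load(std::memory_order_relaxed);
        if (observed == EmptyPool) return EmptyPool;
        if (observed != LockedPool &&
            pool.compare_exchange_strong(observed, LockedPool))     // swap in the sentinel
            return observed;
        for (int i = 0; i < spins; ++i) std::this_thread::yield();  // crude backoff
        if (spins < 1024) spins *= 2;
    }
}

void unlock_pool(std::atomic<task**>& pool, task** restored) {
    pool.store(restored, std::memory_order_release);                // republish the pointer
}
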
+ /** Obviously, the task pool must be locked when calling this method. **/ + void commit_relocated_tasks(std::size_t new_tail) { + __TBB_ASSERT(is_local_task_pool_quiescent(), "Task pool must be locked when calling commit_relocated_tasks()"); + head.store(0, std::memory_order_relaxed); + // Tail is updated last to minimize probability of a thread making arena + // snapshot being misguided into thinking that this task pool is empty. + tail.store(new_tail, std::memory_order_release); + release_task_pool(); + } +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_arena_slot_H diff --git a/third_party/tbb/assert_impl.h b/third_party/tbb/assert_impl.h new file mode 100644 index 000000000..c958d3a40 --- /dev/null +++ b/third_party/tbb/assert_impl.h @@ -0,0 +1,98 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_assert_impl_H +#define __TBB_assert_impl_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_utils.h" + +#include "third_party/libcxx/cstdio" +#include "third_party/libcxx/cstdlib" +#include "third_party/libcxx/cstring" +#include "third_party/libcxx/cstdarg" +#if _MSC_VER && _DEBUG +// MISSING #include +#endif + +#include "third_party/libcxx/mutex" + +#if __TBBMALLOC_BUILD +namespace rml { namespace internal { +#else +namespace tbb { +namespace detail { +namespace r1 { +#endif +// TODO: consider extension for formatted error description string +static void assertion_failure_impl(const char* location, int line, const char* expression, const char* comment) { + + std::fprintf(stderr, "Assertion %s failed (located in the %s function, line in file: %d)\n", + expression, location, line); + + if (comment) { + std::fprintf(stderr, "Detailed description: %s\n", comment); + } +#if _MSC_VER && _DEBUG + if (1 == _CrtDbgReport(_CRT_ASSERT, location, line, "tbb_debug.dll", "%s\r\n%s", expression, comment?comment:"")) { + _CrtDbgBreak(); + } else +#endif + { + std::fflush(stderr); + std::abort(); + } +} + +// Do not move the definition into the assertion_failure function because it will require "magic statics". 
+// It will bring a dependency on C++ runtime on some platforms while assert_impl.h is reused in tbbmalloc +// that should not depend on C++ runtime +static std::atomic assertion_state; + +void __TBB_EXPORTED_FUNC assertion_failure(const char* location, int line, const char* expression, const char* comment) { +#if __TBB_MSVC_UNREACHABLE_CODE_IGNORED + // Workaround for erroneous "unreachable code" during assertion throwing using call_once + #pragma warning (push) + #pragma warning (disable: 4702) +#endif + // We cannot use std::call_once because it brings a dependency on C++ runtime on some platforms + // while assert_impl.h is reused in tbbmalloc that should not depend on C++ runtime + atomic_do_once([&](){ assertion_failure_impl(location, line, expression, comment); }, assertion_state); +#if __TBB_MSVC_UNREACHABLE_CODE_IGNORED + #pragma warning (pop) +#endif +} + +//! Report a runtime warning. +void runtime_warning( const char* format, ... ) { + char str[1024]; std::memset(str, 0, 1024); + va_list args; va_start(args, format); + vsnprintf( str, 1024-1, format, args); + va_end(args); + fprintf(stderr, "TBB Warning: %s\n", str); +} + +#if __TBBMALLOC_BUILD +}} // namespaces rml::internal +#else +} // namespace r1 +} // namespace detail +} // namespace tbb +#endif + +#endif // __TBB_assert_impl_H + diff --git a/third_party/tbb/blocked_range.h b/third_party/tbb/blocked_range.h new file mode 100644 index 000000000..4f3041ab2 --- /dev/null +++ b/third_party/tbb/blocked_range.h @@ -0,0 +1,171 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_blocked_range_H +#define __TBB_blocked_range_H + +#include "third_party/libcxx/cstddef" + +#include "third_party/tbb/detail/_range_common.h" +#include "third_party/tbb/detail/_namespace_injection.h" + +#include "third_party/tbb/version.h" + +namespace tbb { +namespace detail { +namespace d1 { + +/** \page range_req Requirements on range concept + Class \c R implementing the concept of range must define: + - \code R::R( const R& ); \endcode Copy constructor + - \code R::~R(); \endcode Destructor + - \code bool R::is_divisible() const; \endcode True if range can be partitioned into two subranges + - \code bool R::empty() const; \endcode True if range is empty + - \code R::R( R& r, split ); \endcode Split range \c r into two subranges. +**/ + +//! A range over which to iterate. +/** @ingroup algorithms */ +template + __TBB_requires(blocked_range_value) +class blocked_range { +public: + //! Type of a value + /** Called a const_iterator for sake of algorithms that need to treat a blocked_range + as an STL container. */ + using const_iterator = Value; + + //! Type for size of a range + using size_type = std::size_t; + + //! Construct range over half-open interval [begin,end), with the given grainsize. 
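
The Range requirements documented above (copy constructor, destructor, empty(), is_divisible(), and a splitting constructor taking a split tag) are all a user type needs in order to drive the parallel algorithms in this patch. A minimal custom range written against those requirements, halving itself on each split (a sketch, not part of the diff):

#include "third_party/tbb/blocked_range.h"   // brings in tbb::split
#include <cstddef>

class index_range {
    std::size_t begin_, end_, grain_;
public:
    index_range(std::size_t b, std::size_t e, std::size_t grain = 1)
        : begin_(b), end_(e), grain_(grain) {}

    bool empty() const { return begin_ >= end_; }
    bool is_divisible() const { return end_ - begin_ > grain_; }

    // Splitting constructor: takes the upper half of r and leaves the lower half in r.
    index_range(index_range& r, tbb::split)
        : begin_(r.begin_ + (r.end_ - r.begin_) / 2), end_(r.end_), grain_(r.grain_) {
        r.end_ = begin_;
    }

    std::size_t begin() const { return begin_; }
    std::size_t end() const { return end_; }
};
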
+ blocked_range( Value begin_, Value end_, size_type grainsize_=1 ) : + my_end(end_), my_begin(begin_), my_grainsize(grainsize_) + { + __TBB_ASSERT( my_grainsize>0, "grainsize must be positive" ); + } + + //! Beginning of range. + const_iterator begin() const { return my_begin; } + + //! One past last value in range. + const_iterator end() const { return my_end; } + + //! Size of the range + /** Unspecified if end() + __TBB_requires(blocked_range_value && + blocked_range_value) + friend class blocked_range2d; + + template + __TBB_requires(blocked_range_value && + blocked_range_value && + blocked_range_value) + friend class blocked_range3d; + + template + __TBB_requires(blocked_range_value) + friend class blocked_rangeNd_impl; +}; + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::blocked_range; +// Split types +using detail::split; +using detail::proportional_split; +} // namespace v1 + +} // namespace tbb + +#endif /* __TBB_blocked_range_H */ diff --git a/third_party/tbb/blocked_range2d.h b/third_party/tbb/blocked_range2d.h new file mode 100644 index 000000000..e8f3df03e --- /dev/null +++ b/third_party/tbb/blocked_range2d.h @@ -0,0 +1,112 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_blocked_range2d_H +#define __TBB_blocked_range2d_H + +#include "third_party/libcxx/cstddef" + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_range_common.h" + +#include "third_party/tbb/blocked_range.h" + +namespace tbb { +namespace detail { +namespace d1 { + +//! A 2-dimensional range that models the Range concept. +/** @ingroup algorithms */ +template + __TBB_requires(blocked_range_value && + blocked_range_value) +class blocked_range2d { +public: + //! Type for size of an iteration range + using row_range_type = blocked_range; + using col_range_type = blocked_range; + +private: + row_range_type my_rows; + col_range_type my_cols; + +public: + blocked_range2d( RowValue row_begin, RowValue row_end, typename row_range_type::size_type row_grainsize, + ColValue col_begin, ColValue col_end, typename col_range_type::size_type col_grainsize ) : + my_rows(row_begin,row_end,row_grainsize), + my_cols(col_begin,col_end,col_grainsize) + {} + + blocked_range2d( RowValue row_begin, RowValue row_end, + ColValue col_begin, ColValue col_end ) : + my_rows(row_begin,row_end), + my_cols(col_begin,col_end) + {} + + //! True if range is empty + bool empty() const { + // Range is empty if at least one dimension is empty. + return my_rows.empty() || my_cols.empty(); + } + + //! True if range is divisible into two pieces. 
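In everyday use blocked_range and blocked_range2d are passed straight to the parallel algorithms rather than split by hand. A usage sketch follows, assuming tbb::parallel_for from the same library is available; the matrix shape and the grainsize of 16 are arbitrary choices.

    #include "third_party/tbb/blocked_range2d.h"
    #include "third_party/tbb/parallel_for.h"
    #include <cstddef>
    #include <vector>

    // Scale an nrows x ncols matrix in parallel. Each worker receives a 2-D
    // sub-range and walks its rows() and cols() pieces sequentially.
    inline void scale(std::vector<double>& m, int nrows, int ncols, double f) {
        tbb::parallel_for(
            tbb::blocked_range2d<int, int>(0, nrows, 16, 0, ncols, 16),
            [&](const tbb::blocked_range2d<int, int>& r) {
                for (int i = r.rows().begin(); i != r.rows().end(); ++i)
                    for (int j = r.cols().begin(); j != r.cols().end(); ++j)
                        m[static_cast<std::size_t>(i) * ncols + j] *= f;
            });
    }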
+ bool is_divisible() const { + return my_rows.is_divisible() || my_cols.is_divisible(); + } + + blocked_range2d( blocked_range2d& r, split ) : + my_rows(r.my_rows), + my_cols(r.my_cols) + { + split split_obj; + do_split(r, split_obj); + } + + blocked_range2d( blocked_range2d& r, proportional_split& proportion ) : + my_rows(r.my_rows), + my_cols(r.my_cols) + { + do_split(r, proportion); + } + + //! The rows of the iteration space + const row_range_type& rows() const { return my_rows; } + + //! The columns of the iteration space + const col_range_type& cols() const { return my_cols; } + +private: + template + void do_split( blocked_range2d& r, Split& split_obj ) { + if ( my_rows.size()*double(my_cols.grainsize()) < my_cols.size()*double(my_rows.grainsize()) ) { + my_cols.my_begin = col_range_type::do_split(r.my_cols, split_obj); + } else { + my_rows.my_begin = row_range_type::do_split(r.my_rows, split_obj); + } + } +}; + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::blocked_range2d; +} // namespace v1 +} // namespace tbb + +#endif /* __TBB_blocked_range2d_H */ diff --git a/third_party/tbb/blocked_range3d.h b/third_party/tbb/blocked_range3d.h new file mode 100644 index 000000000..dd5e2312f --- /dev/null +++ b/third_party/tbb/blocked_range3d.h @@ -0,0 +1,131 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_blocked_range3d_H +#define __TBB_blocked_range3d_H + +#include "third_party/libcxx/cstddef" + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" + +#include "third_party/tbb/blocked_range.h" + +namespace tbb { +namespace detail { +namespace d1 { + +//! A 3-dimensional range that models the Range concept. +/** @ingroup algorithms */ +template + __TBB_requires(blocked_range_value && + blocked_range_value && + blocked_range_value) +class blocked_range3d { +public: + //! Type for size of an iteration range + using page_range_type = blocked_range; + using row_range_type = blocked_range; + using col_range_type = blocked_range; + +private: + page_range_type my_pages; + row_range_type my_rows; + col_range_type my_cols; + +public: + + blocked_range3d( PageValue page_begin, PageValue page_end, + RowValue row_begin, RowValue row_end, + ColValue col_begin, ColValue col_end ) : + my_pages(page_begin,page_end), + my_rows(row_begin,row_end), + my_cols(col_begin,col_end) + {} + + blocked_range3d( PageValue page_begin, PageValue page_end, typename page_range_type::size_type page_grainsize, + RowValue row_begin, RowValue row_end, typename row_range_type::size_type row_grainsize, + ColValue col_begin, ColValue col_end, typename col_range_type::size_type col_grainsize ) : + my_pages(page_begin,page_end,page_grainsize), + my_rows(row_begin,row_end,row_grainsize), + my_cols(col_begin,col_end,col_grainsize) + {} + + //! True if range is empty + bool empty() const { + // Range is empty if at least one dimension is empty. 
+ return my_pages.empty() || my_rows.empty() || my_cols.empty(); + } + + //! True if range is divisible into two pieces. + bool is_divisible() const { + return my_pages.is_divisible() || my_rows.is_divisible() || my_cols.is_divisible(); + } + + blocked_range3d( blocked_range3d& r, split split_obj ) : + my_pages(r.my_pages), + my_rows(r.my_rows), + my_cols(r.my_cols) + { + do_split(r, split_obj); + } + + blocked_range3d( blocked_range3d& r, proportional_split& proportion ) : + my_pages(r.my_pages), + my_rows(r.my_rows), + my_cols(r.my_cols) + { + do_split(r, proportion); + } + + //! The pages of the iteration space + const page_range_type& pages() const { return my_pages; } + + //! The rows of the iteration space + const row_range_type& rows() const { return my_rows; } + + //! The columns of the iteration space + const col_range_type& cols() const { return my_cols; } + +private: + template + void do_split( blocked_range3d& r, Split& split_obj) { + if ( my_pages.size()*double(my_rows.grainsize()) < my_rows.size()*double(my_pages.grainsize()) ) { + if ( my_rows.size()*double(my_cols.grainsize()) < my_cols.size()*double(my_rows.grainsize()) ) { + my_cols.my_begin = col_range_type::do_split(r.my_cols, split_obj); + } else { + my_rows.my_begin = row_range_type::do_split(r.my_rows, split_obj); + } + } else { + if ( my_pages.size()*double(my_cols.grainsize()) < my_cols.size()*double(my_pages.grainsize()) ) { + my_cols.my_begin = col_range_type::do_split(r.my_cols, split_obj); + } else { + my_pages.my_begin = page_range_type::do_split(r.my_pages, split_obj); + } + } + } +}; + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::blocked_range3d; +} // namespace v1 +} // namespace tbb + +#endif /* __TBB_blocked_range3d_H */ diff --git a/third_party/tbb/blocked_rangeNd.h b/third_party/tbb/blocked_rangeNd.h new file mode 100644 index 000000000..3b48046de --- /dev/null +++ b/third_party/tbb/blocked_rangeNd.h @@ -0,0 +1,148 @@ +// clang-format off +/* + Copyright (c) 2017-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_blocked_rangeNd_H +#define __TBB_blocked_rangeNd_H + +#if !TBB_PREVIEW_BLOCKED_RANGE_ND + #error Set TBB_PREVIEW_BLOCKED_RANGE_ND to include blocked_rangeNd.h +#endif + +#include "third_party/libcxx/algorithm" // std::any_of +#include "third_party/libcxx/array" +#include "third_party/libcxx/cstddef" +#include "third_party/libcxx/type_traits" // std::is_same, std::enable_if + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_template_helpers.h" // index_sequence, make_index_sequence +#include "third_party/tbb/detail/_range_common.h" + +#include "third_party/tbb/blocked_range.h" + +namespace tbb { +namespace detail { +namespace d1 { + +/* + The blocked_rangeNd_impl uses make_index_sequence to automatically generate a ctor with + exactly N arguments of the type tbb::blocked_range. Such ctor provides an opportunity + to use braced-init-list parameters to initialize each dimension. 
+ Use of parameters, whose representation is a braced-init-list, but they're not + std::initializer_list or a reference to one, produces a non-deduced context + within template argument deduction. + + NOTE: blocked_rangeNd must be exactly a templated alias to the blocked_rangeNd_impl + (and not e.g. a derived class), otherwise it would need to declare its own ctor + facing the same problem that the impl class solves. +*/ + +template> + __TBB_requires(blocked_range_value) +class blocked_rangeNd_impl; + +template + __TBB_requires(blocked_range_value) +class blocked_rangeNd_impl> { +public: + //! Type of a value. + using value_type = Value; + +private: + //! Helper type to construct range with N tbb::blocked_range objects. + template + using dim_type_helper = tbb::blocked_range; + +public: + blocked_rangeNd_impl() = delete; + + //! Constructs N-dimensional range over N half-open intervals each represented as tbb::blocked_range. + blocked_rangeNd_impl(const dim_type_helper&... args) : my_dims{ {args...} } {} + + //! Dimensionality of a range. + static constexpr unsigned int ndims() { return N; } + + //! Range in certain dimension. + const tbb::blocked_range& dim(unsigned int dimension) const { + __TBB_ASSERT(dimension < N, "out of bound"); + return my_dims[dimension]; + } + + //------------------------------------------------------------------------ + // Methods that implement Range concept + //------------------------------------------------------------------------ + + //! True if at least one dimension is empty. + bool empty() const { + return std::any_of(my_dims.begin(), my_dims.end(), [](const tbb::blocked_range& d) { + return d.empty(); + }); + } + + //! True if at least one dimension is divisible. + bool is_divisible() const { + return std::any_of(my_dims.begin(), my_dims.end(), [](const tbb::blocked_range& d) { + return d.is_divisible(); + }); + } + + blocked_rangeNd_impl(blocked_rangeNd_impl& r, proportional_split proportion) : my_dims(r.my_dims) { + do_split(r, proportion); + } + + blocked_rangeNd_impl(blocked_rangeNd_impl& r, split proportion) : my_dims(r.my_dims) { + do_split(r, proportion); + } + +private: + static_assert(N != 0, "zero dimensional blocked_rangeNd can't be constructed"); + + //! Ranges in each dimension. 
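In user code the braced-init-list construction described above looks like the sketch below, assuming TBB_PREVIEW_BLOCKED_RANGE_ND is defined before inclusion; the extents and the grainsize of 8 on the middle dimension are arbitrary.

    #define TBB_PREVIEW_BLOCKED_RANGE_ND 1
    #include "third_party/tbb/blocked_rangeNd.h"

    // Each braced pair or triple converts to a tbb::blocked_range<int>, which is
    // exactly the non-deduced-context trick the constructor relies on.
    tbb::blocked_rangeNd<int, 3> volume{{0, 128}, {0, 64, /*grainsize*/ 8}, {0, 32}};

    static_assert(decltype(volume)::ndims() == 3, "three dimensions");
    // volume.dim(0).size() == 128, volume.dim(1).grainsize() == 8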
+ std::array, N> my_dims; + + template + void do_split(blocked_rangeNd_impl& r, split_type proportion) { + static_assert((std::is_same::value || std::is_same::value), "type of split object is incorrect"); + __TBB_ASSERT(r.is_divisible(), "can't split not divisible range"); + + auto my_it = std::max_element(my_dims.begin(), my_dims.end(), [](const tbb::blocked_range& first, const tbb::blocked_range& second) { + return (first.size() * second.grainsize() < second.size() * first.grainsize()); + }); + + auto r_it = r.my_dims.begin() + (my_it - my_dims.begin()); + + my_it->my_begin = tbb::blocked_range::do_split(*r_it, proportion); + + // (!(my_it->my_begin < r_it->my_end) && !(r_it->my_end < my_it->my_begin)) equals to + // (my_it->my_begin == r_it->my_end), but we can't use operator== due to Value concept + __TBB_ASSERT(!(my_it->my_begin < r_it->my_end) && !(r_it->my_end < my_it->my_begin), + "blocked_range has been split incorrectly"); + } +}; + +template +using blocked_rangeNd = blocked_rangeNd_impl; + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::blocked_rangeNd; +} // namespace v1 +} // namespace tbb + +#endif /* __TBB_blocked_rangeNd_H */ + diff --git a/third_party/tbb/cache_aligned_allocator.h b/third_party/tbb/cache_aligned_allocator.h new file mode 100644 index 000000000..0e79b238b --- /dev/null +++ b/third_party/tbb/cache_aligned_allocator.h @@ -0,0 +1,190 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_cache_aligned_allocator_H +#define __TBB_cache_aligned_allocator_H + +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/libcxx/cstdlib" +#include "third_party/libcxx/utility" + +#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT +// MISSING #include +#endif + +namespace tbb { +namespace detail { + +namespace r1 { +TBB_EXPORT void* __TBB_EXPORTED_FUNC cache_aligned_allocate(std::size_t size); +TBB_EXPORT void __TBB_EXPORTED_FUNC cache_aligned_deallocate(void* p); +TBB_EXPORT std::size_t __TBB_EXPORTED_FUNC cache_line_size(); +} + +namespace d1 { + +template +class cache_aligned_allocator { +public: + using value_type = T; + using propagate_on_container_move_assignment = std::true_type; + + //! Always defined for TBB containers (supported since C++17 for std containers) + using is_always_equal = std::true_type; + + cache_aligned_allocator() = default; + template cache_aligned_allocator(const cache_aligned_allocator&) noexcept {} + + //! Allocate space for n objects, starting on a cache/sector line. + __TBB_nodiscard T* allocate(std::size_t n) { + return static_cast(r1::cache_aligned_allocate(n * sizeof(value_type))); + } + + //! Free block of memory that starts on a cache line + void deallocate(T* p, std::size_t) { + r1::cache_aligned_deallocate(p); + } + + //! Largest value for which method allocate might succeed. 
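A usage sketch for the allocator: it guarantees only that each allocation starts on a cache line, so the element type below is additionally padded with alignas to keep neighbouring slots on separate lines. The figure 64 is a common line size used here for illustration; the allocator itself queries the real value at run time.

    #include "third_party/tbb/cache_aligned_allocator.h"
    #include <vector>

    // One counter per worker. The vector's buffer begins on a cache line and
    // every element occupies a full line, so concurrent writers that own
    // different slots do not false-share.
    struct alignas(64) padded_counter { long hits = 0; };

    std::vector<padded_counter, tbb::cache_aligned_allocator<padded_counter>>
        per_worker_hits(8);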
+ std::size_t max_size() const noexcept { + return (~std::size_t(0) - r1::cache_line_size()) / sizeof(value_type); + } + +#if TBB_ALLOCATOR_TRAITS_BROKEN + using pointer = value_type*; + using const_pointer = const value_type*; + using reference = value_type&; + using const_reference = const value_type&; + using difference_type = std::ptrdiff_t; + using size_type = std::size_t; + template struct rebind { + using other = cache_aligned_allocator; + }; + template + void construct(U *p, Args&&... args) + { ::new (p) U(std::forward(args)...); } + void destroy(pointer p) { p->~value_type(); } + pointer address(reference x) const { return &x; } + const_pointer address(const_reference x) const { return &x; } +#endif // TBB_ALLOCATOR_TRAITS_BROKEN +}; + +#if TBB_ALLOCATOR_TRAITS_BROKEN + template<> + class cache_aligned_allocator { + public: + using pointer = void*; + using const_pointer = const void*; + using value_type = void; + template struct rebind { + using other = cache_aligned_allocator; + }; + }; +#endif + +template +bool operator==(const cache_aligned_allocator&, const cache_aligned_allocator&) noexcept { return true; } + +#if !__TBB_CPP20_COMPARISONS_PRESENT +template +bool operator!=(const cache_aligned_allocator&, const cache_aligned_allocator&) noexcept { return false; } +#endif + +#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT + +//! C++17 memory resource wrapper to ensure cache line size alignment +class cache_aligned_resource : public std::pmr::memory_resource { +public: + cache_aligned_resource() : cache_aligned_resource(std::pmr::get_default_resource()) {} + explicit cache_aligned_resource(std::pmr::memory_resource* upstream) : m_upstream(upstream) {} + + std::pmr::memory_resource* upstream_resource() const { + return m_upstream; + } + +private: + //! We don't know what memory resource set. Use padding to guarantee alignment + void* do_allocate(std::size_t bytes, std::size_t alignment) override { + // TODO: make it common with tbb_allocator.cpp + std::size_t cache_line_alignment = correct_alignment(alignment); + std::size_t space = correct_size(bytes) + cache_line_alignment; + std::uintptr_t base = reinterpret_cast(m_upstream->allocate(space)); + __TBB_ASSERT(base != 0, "Upstream resource returned nullptr."); + + // Round up to the next cache line (align the base address) + std::uintptr_t result = (base + cache_line_alignment) & ~(cache_line_alignment - 1); + __TBB_ASSERT((result - base) >= sizeof(std::uintptr_t), "Can`t store a base pointer to the header"); + __TBB_ASSERT(space - (result - base) >= bytes, "Not enough space for the storage"); + + // Record where block actually starts. 
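+        // Worked example with hypothetical numbers: on a machine with 64-byte
+        // cache lines, a request of bytes == 24 and alignment == 16 gives
+        // correct_size(24) == 24 and correct_alignment(16) == 64, so
+        // space == 24 + 64 == 88 bytes are taken from the upstream resource.
+        // If that block lands at base == 0x1008, then
+        // result == (0x1008 + 64) & ~63 == 0x1040, leaving 56 bytes of headroom
+        // in front of result: enough to stash the 8-byte base pointer just
+        // below it while still returning a cache-aligned block of 24 bytes.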
+ (reinterpret_cast(result))[-1] = base; + return reinterpret_cast(result); + } + + void do_deallocate(void* ptr, std::size_t bytes, std::size_t alignment) override { + if (ptr) { + // Recover where block actually starts + std::uintptr_t base = (reinterpret_cast(ptr))[-1]; + m_upstream->deallocate(reinterpret_cast(base), correct_size(bytes) + correct_alignment(alignment)); + } + } + + bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override { + if (this == &other) { return true; } +#if __TBB_USE_OPTIONAL_RTTI + const cache_aligned_resource* other_res = dynamic_cast(&other); + return other_res && (upstream_resource() == other_res->upstream_resource()); +#else + return false; +#endif + } + + std::size_t correct_alignment(std::size_t alignment) { + __TBB_ASSERT(tbb::detail::is_power_of_two(alignment), "Alignment is not a power of 2"); +#if __TBB_CPP17_HW_INTERFERENCE_SIZE_PRESENT + std::size_t cache_line_size = std::hardware_destructive_interference_size; +#else + std::size_t cache_line_size = r1::cache_line_size(); +#endif + return alignment < cache_line_size ? cache_line_size : alignment; + } + + std::size_t correct_size(std::size_t bytes) { + // To handle the case, when small size requested. There could be not + // enough space to store the original pointer. + return bytes < sizeof(std::uintptr_t) ? sizeof(std::uintptr_t) : bytes; + } + + std::pmr::memory_resource* m_upstream; +}; + +#endif // __TBB_CPP17_MEMORY_RESOURCE_PRESENT + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::cache_aligned_allocator; +#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT +using detail::d1::cache_aligned_resource; +#endif +} // namespace v1 +} // namespace tbb + +#endif /* __TBB_cache_aligned_allocator_H */ + diff --git a/third_party/tbb/cancellation_disseminator.h b/third_party/tbb/cancellation_disseminator.h new file mode 100644 index 000000000..4ec879718 --- /dev/null +++ b/third_party/tbb/cancellation_disseminator.h @@ -0,0 +1,86 @@ +// clang-format off +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_cancellation_disseminator_H +#define _TBB_cancellation_disseminator_H + +#include "third_party/tbb/mutex.h" +#include "third_party/tbb/task_group.h" + +#include "third_party/tbb/intrusive_list.h" +#include "third_party/tbb/thread_data.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class cancellation_disseminator { +public: + //! Finds all contexts affected by the state change and propagates the new state to them. + /* The propagation is relayed to the cancellation_disseminator because tasks created by one + external thread can be passed to and executed by other external threads. This means + that context trees can span several arenas at once and thus state change + propagation cannot be generally localized to one arena only. 
+ */ + bool propagate_task_group_state(std::atomic d1::task_group_context::*mptr_state, d1::task_group_context& src, uint32_t new_state) { + if (src.my_may_have_children.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children) { + return true; + } + + // The whole propagation algorithm is under the lock in order to ensure correctness + // in case of concurrent state changes at the different levels of the context tree. + threads_list_mutex_type::scoped_lock lock(my_threads_list_mutex); + // TODO: consider to use double-check idiom + if ((src.*mptr_state).load(std::memory_order_relaxed) != new_state) { + // Another thread has concurrently changed the state. Back down. + return false; + } + + // Advance global state propagation epoch + ++the_context_state_propagation_epoch; + // Propagate to all workers and external threads and sync up their local epochs with the global one + // The whole propagation sequence is locked, thus no contention is expected + for (auto& thr_data : my_threads_list) { + thr_data.propagate_task_group_state(mptr_state, src, new_state); + } + + return true; + } + + void register_thread(thread_data& td) { + threads_list_mutex_type::scoped_lock lock(my_threads_list_mutex); + my_threads_list.push_front(td); + } + + void unregister_thread(thread_data& td) { + threads_list_mutex_type::scoped_lock lock(my_threads_list_mutex); + my_threads_list.remove(td); + } + +private: + using thread_data_list_type = intrusive_list; + using threads_list_mutex_type = d1::mutex; + + threads_list_mutex_type my_threads_list_mutex; + thread_data_list_type my_threads_list; +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_cancellation_disseminator_H diff --git a/third_party/tbb/co_context.h b/third_party/tbb/co_context.h new file mode 100644 index 000000000..fe1ddaee2 --- /dev/null +++ b/third_party/tbb/co_context.h @@ -0,0 +1,428 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef _TBB_co_context_H +#define _TBB_co_context_H + +#include "third_party/tbb/detail/_config.h" + +#if __TBB_RESUMABLE_TASKS + +#include "third_party/libcxx/cstddef" +#include "third_party/libcxx/cstdint" + +#if __TBB_RESUMABLE_TASKS_USE_THREADS + +#if _WIN32 || _WIN64 +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" +#else +#include "libc/calls/weirdtypes.h" +#include "libc/sysv/consts/clock.h" +#include "libc/thread/thread.h" +#include "libc/thread/thread2.h" +#endif + +#include "third_party/libcxx/condition_variable" +#include "third_party/tbb/governor.h" + +#elif _WIN32 || _WIN64 +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" +#else +// ucontext.h API is deprecated since macOS 10.6 +#if __APPLE__ + #if __INTEL_COMPILER + #pragma warning(push) + #pragma warning(disable:1478) + #elif __clang__ + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wdeprecated-declarations" + #endif +#endif // __APPLE__ + +// MISSING #include +#include "libc/calls/calls.h" +#include "libc/calls/weirdtypes.h" +#include "libc/runtime/runtime.h" +#include "libc/sysv/consts/map.h" +#include "libc/sysv/consts/mlock.h" +#include "libc/sysv/consts/msync.h" +#include "libc/sysv/consts/posix.h" +#include "libc/sysv/consts/prot.h" +#include "libc/sysv/consts/madv.h" +#include "libc/sysv/consts/mfd.h" +#include "libc/sysv/consts/mremap.h" // mprotect + +#include "third_party/tbb/governor.h" // default_page_size() + +#ifndef MAP_STACK +// macOS* does not define MAP_STACK +#define MAP_STACK 0 +#endif +#ifndef MAP_ANONYMOUS +// macOS* defines MAP_ANON, which is deprecated in Linux*. +#define MAP_ANONYMOUS MAP_ANON +#endif +#endif // _WIN32 || _WIN64 + +namespace tbb { +namespace detail { +namespace r1 { + +#if __TBB_RESUMABLE_TASKS_USE_THREADS + struct coroutine_type { +#if _WIN32 || _WIN64 + using handle_type = HANDLE; +#else + using handle_type = pthread_t; +#endif + + handle_type my_thread; + std::condition_variable my_condvar; + std::mutex my_mutex; + thread_data* my_thread_data{ nullptr }; + bool my_is_active{ true }; + }; +#elif _WIN32 || _WIN64 + typedef LPVOID coroutine_type; +#else + struct coroutine_type { + coroutine_type() : my_context(), my_stack(), my_stack_size() {} + ucontext_t my_context; + void* my_stack; + std::size_t my_stack_size; + }; +#endif + + // Forward declaration of the coroutine API. 
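For orientation, the sketch below shows the public resumable-tasks API that this coroutine machinery ultimately backs, assuming __TBB_RESUMABLE_TASKS is enabled and that tbb::task::suspend / tbb::task::resume are exposed by this tree's task.h; the async_engine type is hypothetical.

    #include "third_party/tbb/task.h"
    #include <thread>

    // Hypothetical asynchronous engine that invokes its callback on another thread.
    struct async_engine {
        template <typename Callback>
        void start(Callback cb) { std::thread(std::move(cb)).detach(); }
    };

    // Suspend the calling TBB task instead of blocking its worker thread; the
    // task is resumed at the recorded suspend point once the engine calls back.
    inline void wait_for_engine(async_engine& e) {
        tbb::task::suspend([&](tbb::task::suspend_point sp) {
            e.start([sp] { tbb::task::resume(sp); });
        });
    }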
+ void create_coroutine(coroutine_type& c, std::size_t stack_size, void* arg); + void current_coroutine(coroutine_type& c); + void swap_coroutine(coroutine_type& prev_coroutine, coroutine_type& new_coroutine); + void destroy_coroutine(coroutine_type& c); + +class co_context { + enum co_state { + co_invalid, + co_suspended, + co_executing, + co_destroyed + }; + coroutine_type my_coroutine; + co_state my_state; + +public: + co_context(std::size_t stack_size, void* arg) + : my_state(stack_size ? co_suspended : co_executing) + { + if (stack_size) { + __TBB_ASSERT(arg != nullptr, nullptr); + create_coroutine(my_coroutine, stack_size, arg); + } else { + current_coroutine(my_coroutine); + } + } + + ~co_context() { + __TBB_ASSERT(1 << my_state & (1 << co_suspended | 1 << co_executing), nullptr); + if (my_state == co_suspended) { +#if __TBB_RESUMABLE_TASKS_USE_THREADS + my_state = co_executing; +#endif + destroy_coroutine(my_coroutine); + } + my_state = co_destroyed; + } + + void resume(co_context& target) { + // Do not create non-trivial objects on the stack of this function. They might never be destroyed. + __TBB_ASSERT(my_state == co_executing, nullptr); + __TBB_ASSERT(target.my_state == co_suspended, nullptr); + + my_state = co_suspended; + target.my_state = co_executing; + + // 'target' can reference an invalid object after swap_coroutine. Do not access it. + swap_coroutine(my_coroutine, target.my_coroutine); + + __TBB_ASSERT(my_state == co_executing, nullptr); + } +}; + +#if _WIN32 || _WIN64 +/* [[noreturn]] */ void __stdcall co_local_wait_for_all(void* arg) noexcept; +#else +/* [[noreturn]] */ void co_local_wait_for_all(unsigned hi, unsigned lo) noexcept; +#endif + +#if __TBB_RESUMABLE_TASKS_USE_THREADS +void handle_perror(int error_code, const char* what); + +inline void check(int error_code, const char* routine) { + if (error_code) { + handle_perror(error_code, routine); + } +} + +using thread_data_t = std::pair; + +#if _WIN32 || _WIN64 +inline unsigned WINAPI coroutine_thread_func(void* d) +#else +inline void* coroutine_thread_func(void* d) +#endif +{ + thread_data_t& data = *static_cast(d); + coroutine_type& c = data.first; + void* arg = data.second; + { + std::unique_lock lock(c.my_mutex); + __TBB_ASSERT(c.my_thread_data == nullptr, nullptr); + c.my_is_active = false; + + // We read the data notify the waiting thread + data.second = nullptr; + c.my_condvar.notify_one(); + + c.my_condvar.wait(lock, [&c] { return c.my_is_active == true; }); + } + __TBB_ASSERT(c.my_thread_data != nullptr, nullptr); + governor::set_thread_data(*c.my_thread_data); + +#if _WIN32 || _WIN64 + co_local_wait_for_all(arg); + + return 0; +#else + std::uintptr_t addr = std::uintptr_t(arg); + unsigned lo = unsigned(addr); + unsigned hi = unsigned(std::uint64_t(addr) >> 32); + __TBB_ASSERT(sizeof(addr) == 8 || hi == 0, nullptr); + + co_local_wait_for_all(hi, lo); + + return nullptr; +#endif +}; + +inline void create_coroutine(coroutine_type& c, std::size_t stack_size, void* arg) { + thread_data_t data{ c, arg }; + +#if _WIN32 || _WIN64 + c.my_thread = (HANDLE)_beginthreadex(nullptr, unsigned(stack_size), coroutine_thread_func, &data, STACK_SIZE_PARAM_IS_A_RESERVATION, nullptr); + if (!c.my_thread) { + handle_perror(0, "create_coroutine: _beginthreadex failed\n"); + } +#else + pthread_attr_t s; + check(pthread_attr_init(&s), "pthread_attr_init has failed"); + if (stack_size > 0) { + check(pthread_attr_setstacksize(&s, stack_size), "pthread_attr_setstack_size has failed"); + } + check(pthread_create(&c.my_thread, &s, 
coroutine_thread_func, &data), "pthread_create has failed"); + check(pthread_attr_destroy(&s), "pthread_attr_destroy has failed"); +#endif + + // Wait for the just created thread to read the data + std::unique_lock lock(c.my_mutex); + c.my_condvar.wait(lock, [&arg] { return arg == nullptr; }); +} + +inline void current_coroutine(coroutine_type& c) { +#if _WIN32 || _WIN64 + c.my_thread = GetCurrentThread(); +#else + c.my_thread = pthread_self(); +#endif +} + +inline void swap_coroutine(coroutine_type& prev_coroutine, coroutine_type& new_coroutine) { + thread_data* td = governor::get_thread_data(); + __TBB_ASSERT(prev_coroutine.my_is_active == true, "The current thread should be active"); + + // Detach our state before notification other thread + // (because we might be notified just after other thread notification) + prev_coroutine.my_thread_data = nullptr; + prev_coroutine.my_is_active = false; + governor::clear_thread_data(); + + { + std::unique_lock lock(new_coroutine.my_mutex); + __TBB_ASSERT(new_coroutine.my_is_active == false, "The sleeping thread should not be active"); + __TBB_ASSERT(new_coroutine.my_thread_data == nullptr, "The sleeping thread should not be active"); + + new_coroutine.my_thread_data = td; + new_coroutine.my_is_active = true; + new_coroutine.my_condvar.notify_one(); + } + + std::unique_lock lock(prev_coroutine.my_mutex); + prev_coroutine.my_condvar.wait(lock, [&prev_coroutine] { return prev_coroutine.my_is_active == true; }); + __TBB_ASSERT(governor::get_thread_data() != nullptr, nullptr); + governor::set_thread_data(*prev_coroutine.my_thread_data); +} + +inline void destroy_coroutine(coroutine_type& c) { + { + std::unique_lock lock(c.my_mutex); + __TBB_ASSERT(c.my_thread_data == nullptr, "The sleeping thread should not be active"); + __TBB_ASSERT(c.my_is_active == false, "The sleeping thread should not be active"); + c.my_is_active = true; + c.my_condvar.notify_one(); + } +#if _WIN32 || _WIN64 + WaitForSingleObject(c.my_thread, INFINITE); + CloseHandle(c.my_thread); +#else + check(pthread_join(c.my_thread, nullptr), "pthread_join has failed"); +#endif +} +#elif _WIN32 || _WIN64 +inline void create_coroutine(coroutine_type& c, std::size_t stack_size, void* arg) { + __TBB_ASSERT(arg, nullptr); + c = CreateFiber(stack_size, co_local_wait_for_all, arg); + __TBB_ASSERT(c, nullptr); +} + +inline void current_coroutine(coroutine_type& c) { + c = IsThreadAFiber() ? 
GetCurrentFiber() : + ConvertThreadToFiberEx(nullptr, FIBER_FLAG_FLOAT_SWITCH); + __TBB_ASSERT(c, nullptr); +} + +inline void swap_coroutine(coroutine_type& prev_coroutine, coroutine_type& new_coroutine) { + if (!IsThreadAFiber()) { + ConvertThreadToFiberEx(nullptr, FIBER_FLAG_FLOAT_SWITCH); + } + __TBB_ASSERT(new_coroutine, nullptr); + prev_coroutine = GetCurrentFiber(); + __TBB_ASSERT(prev_coroutine, nullptr); + SwitchToFiber(new_coroutine); +} + +inline void destroy_coroutine(coroutine_type& c) { + __TBB_ASSERT(c, nullptr); + DeleteFiber(c); +} +#else // !(_WIN32 || _WIN64) + +inline void create_coroutine(coroutine_type& c, std::size_t stack_size, void* arg) { + const std::size_t REG_PAGE_SIZE = governor::default_page_size(); + const std::size_t page_aligned_stack_size = (stack_size + (REG_PAGE_SIZE - 1)) & ~(REG_PAGE_SIZE - 1); + const std::size_t protected_stack_size = page_aligned_stack_size + 2 * REG_PAGE_SIZE; + + // Allocate the stack with protection property + std::uintptr_t stack_ptr = (std::uintptr_t)mmap(nullptr, protected_stack_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); + __TBB_ASSERT((void*)stack_ptr != MAP_FAILED, nullptr); + + // Allow read write on our stack (guarded pages are still protected) + int err = mprotect((void*)(stack_ptr + REG_PAGE_SIZE), page_aligned_stack_size, PROT_READ | PROT_WRITE); + __TBB_ASSERT_EX(!err, nullptr); + + // Remember the stack state + c.my_stack = (void*)(stack_ptr + REG_PAGE_SIZE); + c.my_stack_size = page_aligned_stack_size; + + err = getcontext(&c.my_context); + __TBB_ASSERT_EX(!err, nullptr); + + c.my_context.uc_link = nullptr; + // cast to char* to disable FreeBSD clang-3.4.1 'incompatible type' error + c.my_context.uc_stack.ss_sp = (char*)c.my_stack; + c.my_context.uc_stack.ss_size = c.my_stack_size; + c.my_context.uc_stack.ss_flags = 0; + + typedef void(*coroutine_func_t)(); + + std::uintptr_t addr = std::uintptr_t(arg); + unsigned lo = unsigned(addr); + unsigned hi = unsigned(std::uint64_t(addr) >> 32); + __TBB_ASSERT(sizeof(addr) == 8 || hi == 0, nullptr); + + makecontext(&c.my_context, (coroutine_func_t)co_local_wait_for_all, 2, hi, lo); +} + +inline void current_coroutine(coroutine_type& c) { + int err = getcontext(&c.my_context); + __TBB_ASSERT_EX(!err, nullptr); +} + +inline void swap_coroutine(coroutine_type& prev_coroutine, coroutine_type& new_coroutine) { + int err = swapcontext(&prev_coroutine.my_context, &new_coroutine.my_context); + __TBB_ASSERT_EX(!err, nullptr); +} + +inline void destroy_coroutine(coroutine_type& c) { + const std::size_t REG_PAGE_SIZE = governor::default_page_size(); + // Free stack memory with guarded pages + munmap((void*)((std::uintptr_t)c.my_stack - REG_PAGE_SIZE), c.my_stack_size + 2 * REG_PAGE_SIZE); + // Clear the stack state afterwards + c.my_stack = nullptr; + c.my_stack_size = 0; +} + +#if __APPLE__ + #if __INTEL_COMPILER + #pragma warning(pop) // 1478 warning + #elif __clang__ + #pragma clang diagnostic pop // "-Wdeprecated-declarations" + #endif +#endif + +#endif // _WIN32 || _WIN64 + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* __TBB_RESUMABLE_TASKS */ + +#endif /* _TBB_co_context_H */ diff --git a/third_party/tbb/collaborative_call_once.h b/third_party/tbb/collaborative_call_once.h new file mode 100644 index 000000000..b154b6f7f --- /dev/null +++ b/third_party/tbb/collaborative_call_once.h @@ -0,0 +1,236 @@ +// clang-format off +/* + Copyright (c) 2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + 
you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_collaborative_call_once_H +#define __TBB_collaborative_call_once_H + +#include "third_party/tbb/task_arena.h" +#include "third_party/tbb/task_group.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace d1 { + +#if _MSC_VER && !defined(__INTEL_COMPILER) + // Suppress warning: structure was padded due to alignment specifier + #pragma warning (push) + #pragma warning (disable: 4324) +#endif + +constexpr std::uintptr_t collaborative_once_max_references = max_nfs_size; +constexpr std::uintptr_t collaborative_once_references_mask = collaborative_once_max_references-1; + +class alignas(max_nfs_size) collaborative_once_runner : no_copy { + + struct storage_t { + task_arena m_arena{ task_arena::attach{} }; + wait_context m_wait_context{1}; + }; + + std::atomic m_ref_count{0}; + std::atomic m_is_ready{false}; + + // Storage with task_arena and wait_context must be initialized only by winner thread + union { + storage_t m_storage; + }; + + template + void isolated_execute(Fn f) { + auto func = [f] { + f(); + // delegate_base requires bool returning functor while isolate_within_arena ignores the result + return true; + }; + + delegated_function delegate(func); + + r1::isolate_within_arena(delegate, reinterpret_cast(this)); + } + +public: + class lifetime_guard : no_copy { + collaborative_once_runner& m_runner; + public: + lifetime_guard(collaborative_once_runner& r) : m_runner(r) { + m_runner.m_ref_count++; + } + ~lifetime_guard() { + m_runner.m_ref_count--; + } + }; + + collaborative_once_runner() {} + + ~collaborative_once_runner() { + spin_wait_until_eq(m_ref_count, 0, std::memory_order_acquire); + if (m_is_ready.load(std::memory_order_relaxed)) { + m_storage.~storage_t(); + } + } + + std::uintptr_t to_bits() { + return reinterpret_cast(this); + } + + static collaborative_once_runner* from_bits(std::uintptr_t bits) { + __TBB_ASSERT( (bits & collaborative_once_references_mask) == 0, "invalid pointer, last log2(max_nfs_size) bits must be zero" ); + return reinterpret_cast(bits); + } + + template + void run_once(F&& f) { + __TBB_ASSERT(!m_is_ready.load(std::memory_order_relaxed), "storage with task_arena and wait_context is already initialized"); + // Initialize internal state + new(&m_storage) storage_t(); + m_storage.m_arena.execute([&] { + isolated_execute([&] { + task_group_context context{ task_group_context::bound, + task_group_context::default_traits | task_group_context::concurrent_wait }; + + function_stack_task t{ std::forward(f), m_storage.m_wait_context }; + + // Set the ready flag after entering the execute body to prevent + // moonlighting threads from occupying all slots inside the arena. 
+ m_is_ready.store(true, std::memory_order_release); + execute_and_wait(t, context, m_storage.m_wait_context, context); + }); + }); + } + + void assist() noexcept { + // Do not join the arena until the winner thread takes the slot + spin_wait_while_eq(m_is_ready, false); + m_storage.m_arena.execute([&] { + isolated_execute([&] { + // We do not want to get an exception from user functor on moonlighting threads. + // The exception is handled with the winner thread + task_group_context stub_context; + wait(m_storage.m_wait_context, stub_context); + }); + }); + } + +}; + +class collaborative_once_flag : no_copy { + enum state : std::uintptr_t { + uninitialized, + done, +#if TBB_USE_ASSERT + dead +#endif + }; + std::atomic m_state{ state::uninitialized }; + + template + friend void collaborative_call_once(collaborative_once_flag& flag, Fn&& f, Args&&... args); + + void set_completion_state(std::uintptr_t runner_bits, std::uintptr_t desired) { + std::uintptr_t expected = runner_bits; + do { + expected = runner_bits; + // Possible inefficiency: when we start waiting, + // some moonlighting threads might continue coming that will prolong our waiting. + // Fortunately, there are limited number of threads on the system so wait time is limited. + spin_wait_until_eq(m_state, expected); + } while (!m_state.compare_exchange_strong(expected, desired)); + } + + template + void do_collaborative_call_once(Fn&& f) { + std::uintptr_t expected = m_state.load(std::memory_order_acquire); + collaborative_once_runner runner; + + do { + if (expected == state::uninitialized && m_state.compare_exchange_strong(expected, runner.to_bits())) { + // Winner thread + runner.run_once([&] { + try_call([&] { + std::forward(f)(); + }).on_exception([&] { + // Reset the state to uninitialized to allow other threads to try initialization again + set_completion_state(runner.to_bits(), state::uninitialized); + }); + // We successfully executed functor + set_completion_state(runner.to_bits(), state::done); + }); + break; + } else { + // Moonlighting thread: we need to add a reference to the state to prolong runner lifetime. + // However, the maximum number of references are limited with runner alignment. + // So, we use CAS loop and spin_wait to guarantee that references never exceed "max_value". + do { + auto max_value = expected | collaborative_once_references_mask; + expected = spin_wait_while_eq(m_state, max_value); + // "expected > state::done" prevents storing values, when state is uninitialized or done + } while (expected > state::done && !m_state.compare_exchange_strong(expected, expected + 1)); + + if (auto shared_runner = collaborative_once_runner::from_bits(expected & ~collaborative_once_references_mask)) { + collaborative_once_runner::lifetime_guard guard{*shared_runner}; + m_state.fetch_sub(1); + + // The moonlighting threads are not expected to handle exceptions from user functor. + // Therefore, no exception is expected from assist(). + shared_runner->assist(); + } + } + __TBB_ASSERT(m_state.load(std::memory_order_relaxed) != state::dead, + "collaborative_once_flag has been prematurely destroyed"); + } while (expected != state::done); + } + +#if TBB_USE_ASSERT +public: + ~collaborative_once_flag() { + m_state.store(state::dead, std::memory_order_relaxed); + } +#endif +}; + + +template +void collaborative_call_once(collaborative_once_flag& flag, Fn&& fn, Args&&... 
args) { + __TBB_ASSERT(flag.m_state.load(std::memory_order_relaxed) != collaborative_once_flag::dead, + "collaborative_once_flag has been prematurely destroyed"); + if (flag.m_state.load(std::memory_order_acquire) != collaborative_once_flag::done) { + #if __TBB_GCC_PARAMETER_PACK_IN_LAMBDAS_BROKEN + // Using stored_pack to suppress bug in GCC 4.8 + // with parameter pack expansion in lambda + auto stored_pack = save_pack(std::forward(args)...); + auto func = [&] { call(std::forward(fn), std::move(stored_pack)); }; + #else + auto func = [&] { fn(std::forward(args)...); }; + #endif + flag.do_collaborative_call_once(func); + } +} + +#if _MSC_VER && !defined(__INTEL_COMPILER) + #pragma warning (pop) // 4324 warning +#endif + +} // namespace d1 +} // namespace detail + +using detail::d1::collaborative_call_once; +using detail::d1::collaborative_once_flag; +} // namespace tbb + +#endif // __TBB_collaborative_call_once_H diff --git a/third_party/tbb/combinable.h b/third_party/tbb/combinable.h new file mode 100644 index 000000000..63eaf36e7 --- /dev/null +++ b/third_party/tbb/combinable.h @@ -0,0 +1,70 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_combinable_H +#define __TBB_combinable_H + +#include "third_party/tbb/detail/_namespace_injection.h" + +#include "third_party/tbb/enumerable_thread_specific.h" +#include "third_party/tbb/cache_aligned_allocator.h" + +namespace tbb { +namespace detail { +namespace d1 { +/** \name combinable **/ +//@{ +//! Thread-local storage with optional reduction +/** @ingroup containers */ +template +class combinable { + using my_alloc = typename tbb::cache_aligned_allocator; + using my_ets_type = typename tbb::enumerable_thread_specific; + my_ets_type my_ets; + +public: + combinable() = default; + + template + explicit combinable(Finit _finit) : my_ets(_finit) { } + + void clear() { my_ets.clear(); } + + T& local() { return my_ets.local(); } + + T& local(bool& exists) { return my_ets.local(exists); } + + // combine_func_t has signature T(T,T) or T(const T&, const T&) + template + T combine(CombineFunc f_combine) { return my_ets.combine(f_combine); } + + // combine_func_t has signature void(T) or void(const T&) + template + void combine_each(CombineFunc f_combine) { my_ets.combine_each(f_combine); } +}; + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::combinable; +} // inline namespace v1 + +} // namespace tbb + +#endif /* __TBB_combinable_H */ + diff --git a/third_party/tbb/concurrent_bounded_queue.cpp b/third_party/tbb/concurrent_bounded_queue.cpp new file mode 100644 index 000000000..6608c59a8 --- /dev/null +++ b/third_party/tbb/concurrent_bounded_queue.cpp @@ -0,0 +1,85 @@ +// clang-format off +/* + Copyright (c) 2020-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/concurrent_queue.h" +#include "third_party/tbb/cache_aligned_allocator.h" +#include "third_party/tbb/concurrent_monitor.h" + +namespace tbb { +namespace detail { +namespace r1 { + +static constexpr std::size_t monitors_number = 2; + +std::uint8_t* __TBB_EXPORTED_FUNC allocate_bounded_queue_rep( std::size_t queue_rep_size ) +{ + std::size_t monitors_mem_size = sizeof(concurrent_monitor) * monitors_number; + std::uint8_t* mem = static_cast(cache_aligned_allocate(queue_rep_size + monitors_mem_size)); + + concurrent_monitor* monitors = reinterpret_cast(mem + queue_rep_size); + for (std::size_t i = 0; i < monitors_number; ++i) { + new (monitors + i) concurrent_monitor(); + } + + return mem; +} + +void __TBB_EXPORTED_FUNC deallocate_bounded_queue_rep( std::uint8_t* mem, std::size_t queue_rep_size ) +{ + concurrent_monitor* monitors = reinterpret_cast(mem + queue_rep_size); + for (std::size_t i = 0; i < monitors_number; ++i) { + monitors[i].~concurrent_monitor(); + } + + cache_aligned_deallocate(mem); +} + +void __TBB_EXPORTED_FUNC wait_bounded_queue_monitor( concurrent_monitor* monitors, std::size_t monitor_tag, + std::ptrdiff_t target, d1::delegate_base& predicate ) +{ + __TBB_ASSERT(monitor_tag < monitors_number, nullptr); + concurrent_monitor& monitor = monitors[monitor_tag]; + + monitor.wait([&] { return !predicate(); }, std::uintptr_t(target)); +} + +void __TBB_EXPORTED_FUNC abort_bounded_queue_monitors( concurrent_monitor* monitors ) { + concurrent_monitor& items_avail = monitors[d2::cbq_items_avail_tag]; + concurrent_monitor& slots_avail = monitors[d2::cbq_slots_avail_tag]; + + items_avail.abort_all(); + slots_avail.abort_all(); +} + +struct predicate_leq { + std::size_t my_ticket; + predicate_leq( std::size_t ticket ) : my_ticket(ticket) {} + bool operator() ( std::uintptr_t ticket ) const { return static_cast(ticket) <= my_ticket; } +}; + +void __TBB_EXPORTED_FUNC notify_bounded_queue_monitor( concurrent_monitor* monitors, + std::size_t monitor_tag, std::size_t ticket) +{ + __TBB_ASSERT(monitor_tag < monitors_number, nullptr); + concurrent_monitor& monitor = monitors[monitor_tag]; + monitor.notify(predicate_leq(ticket)); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/concurrent_hash_map.h b/third_party/tbb/concurrent_hash_map.h new file mode 100644 index 000000000..ae1a0e0a2 --- /dev/null +++ b/third_party/tbb/concurrent_hash_map.h @@ -0,0 +1,1665 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_concurrent_hash_map_H +#define __TBB_concurrent_hash_map_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_allocator_traits.h" +#include "third_party/tbb/detail/_containers_helpers.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_hash_compare.h" +#include "third_party/tbb/detail/_range_common.h" +#include "third_party/tbb/tbb_allocator.h" +#include "third_party/tbb/spin_rw_mutex.h" + +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/initializer_list" +#include "third_party/libcxx/tuple" +#include "third_party/libcxx/iterator" +#include "third_party/libcxx/utility" // Need std::pair +#include "third_party/libcxx/cstring" // Need std::memset + +namespace tbb { +namespace detail { +namespace d2 { + +#if __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS && __TBB_CPP20_CONCEPTS_PRESENT +template +concept ch_map_rw_scoped_lockable = rw_scoped_lockable && + requires(const typename Mutex::scoped_lock& sl) { + { sl.is_writer() } -> std::convertible_to; +}; +#endif + +template +struct hash_map_node_base : no_copy { + using mutex_type = MutexType; + // Scoped lock type for mutex + using scoped_type = typename MutexType::scoped_lock; + // Next node in chain + hash_map_node_base* next; + mutex_type mutex; +}; + +// Incompleteness flag value +static void* const rehash_req_flag = reinterpret_cast(std::size_t(3)); +// Rehashed empty bucket flag +static void* const empty_rehashed_flag = reinterpret_cast(std::size_t(0)); + +template +bool rehash_required( hash_map_node_base* node_ptr ) { + return reinterpret_cast(node_ptr) == rehash_req_flag; +} + +#if TBB_USE_ASSERT +template +bool empty_rehashed( hash_map_node_base* node_ptr ) { + return reinterpret_cast(node_ptr) == empty_rehashed_flag; +} +#endif + +// base class of concurrent_hash_map + +template +class hash_map_base { +public: + using size_type = std::size_t; + using hashcode_type = std::size_t; + using segment_index_type = std::size_t; + using node_base = hash_map_node_base; + + struct bucket : no_copy { + using mutex_type = MutexType; + using scoped_type = typename mutex_type::scoped_lock; + + bucket() : node_list(nullptr) {} + bucket( node_base* ptr ) : node_list(ptr) {} + + mutex_type mutex; + std::atomic node_list; + }; + + using allocator_type = Allocator; + using allocator_traits_type = tbb::detail::allocator_traits; + using bucket_allocator_type = typename allocator_traits_type::template rebind_alloc; + using bucket_allocator_traits = tbb::detail::allocator_traits; + + // Count of segments in the first block + static constexpr size_type embedded_block = 1; + // Count of segments in the first block + static constexpr size_type embedded_buckets = 1 << embedded_block; + // Count of segments in the first block + static constexpr size_type first_block = 8; //including embedded_block. 
perfect with bucket size 16, so the allocations are power of 4096 + // Size of a pointer / table size + static constexpr size_type pointers_per_table = sizeof(segment_index_type) * 8; // one segment per bit + + using segment_ptr_type = bucket*; + using atomic_segment_type = std::atomic; + using segments_table_type = atomic_segment_type[pointers_per_table]; + + hash_map_base( const allocator_type& alloc ) : my_allocator(alloc), my_mask(embedded_buckets - 1), my_size(0) { + for (size_type i = 0; i != embedded_buckets; ++i) { + my_embedded_segment[i].node_list.store(nullptr, std::memory_order_relaxed); + } + + for (size_type segment_index = 0; segment_index < pointers_per_table; ++segment_index) { + auto argument = segment_index < embedded_block ? my_embedded_segment + segment_base(segment_index) : nullptr; + my_table[segment_index].store(argument, std::memory_order_relaxed); + } + + __TBB_ASSERT( embedded_block <= first_block, "The first block number must include embedded blocks"); + } + + // segment index of given index in the array + static segment_index_type segment_index_of( size_type index ) { + return segment_index_type(tbb::detail::log2( index|1 )); + } + + // the first array index of given segment + static segment_index_type segment_base( segment_index_type k ) { + return (segment_index_type(1) << k & ~segment_index_type(1)); + } + + // segment size except for k == 0 + static size_type segment_size( segment_index_type k ) { + return size_type(1) << k; // fake value for k==0 + } + + // true if ptr is valid pointer + static bool is_valid( void* ptr ) { + return reinterpret_cast(ptr) > uintptr_t(63); + } + + template + void init_buckets_impl( segment_ptr_type ptr, size_type sz, const Args&... args ) { + for (size_type i = 0; i < sz; ++i) { + bucket_allocator_traits::construct(my_allocator, ptr + i, args...); + } + } + + // Initialize buckets + void init_buckets( segment_ptr_type ptr, size_type sz, bool is_initial ) { + if (is_initial) { + init_buckets_impl(ptr, sz); + } else { + init_buckets_impl(ptr, sz, reinterpret_cast(rehash_req_flag)); + } + } + + // Add node n to bucket b + static void add_to_bucket( bucket* b, node_base* n ) { + __TBB_ASSERT(!rehash_required(b->node_list.load(std::memory_order_relaxed)), nullptr); + n->next = b->node_list.load(std::memory_order_relaxed); + b->node_list.store(n, std::memory_order_relaxed); // its under lock and flag is set + } + + const bucket_allocator_type& get_allocator() const { + return my_allocator; + } + + bucket_allocator_type& get_allocator() { + return my_allocator; + } + + // Enable segment + void enable_segment( segment_index_type k, bool is_initial = false ) { + __TBB_ASSERT( k, "Zero segment must be embedded" ); + size_type sz; + __TBB_ASSERT( !is_valid(my_table[k].load(std::memory_order_relaxed)), "Wrong concurrent assignment"); + if (k >= first_block) { + sz = segment_size(k); + segment_ptr_type ptr = nullptr; + try_call( [&] { + ptr = bucket_allocator_traits::allocate(my_allocator, sz); + } ).on_exception( [&] { + my_table[k].store(nullptr, std::memory_order_relaxed); + }); + + __TBB_ASSERT(ptr, nullptr); + init_buckets(ptr, sz, is_initial); + my_table[k].store(ptr, std::memory_order_release); + sz <<= 1;// double it to get entire capacity of the container + } else { // the first block + __TBB_ASSERT( k == embedded_block, "Wrong segment index" ); + sz = segment_size(first_block); + segment_ptr_type ptr = nullptr; + try_call( [&] { + ptr = bucket_allocator_traits::allocate(my_allocator, sz - embedded_buckets); + } ).on_exception( 
[&] { + my_table[k].store(nullptr, std::memory_order_relaxed); + }); + + __TBB_ASSERT(ptr, nullptr); + init_buckets(ptr, sz - embedded_buckets, is_initial); + ptr -= segment_base(embedded_block); + for(segment_index_type i = embedded_block; i < first_block; i++) // calc the offsets + my_table[i].store(ptr + segment_base(i), std::memory_order_release); + } + my_mask.store(sz-1, std::memory_order_release); + } + + void delete_segment( segment_index_type s ) { + segment_ptr_type buckets_ptr = my_table[s].load(std::memory_order_relaxed); + size_type sz = segment_size( s ? s : 1 ); + + size_type deallocate_size = 0; + + if (s >= first_block) { // the first segment or the next + deallocate_size = sz; + } else if (s == embedded_block && embedded_block != first_block) { + deallocate_size = segment_size(first_block) - embedded_buckets; + } + + for (size_type i = 0; i < deallocate_size; ++i) { + bucket_allocator_traits::destroy(my_allocator, buckets_ptr + i); + } + if (deallocate_size != 0) { + bucket_allocator_traits::deallocate(my_allocator, buckets_ptr, deallocate_size); + } + + if (s >= embedded_block) my_table[s].store(nullptr, std::memory_order_relaxed); + } + + // Get bucket by (masked) hashcode + bucket *get_bucket( hashcode_type h ) const noexcept { + segment_index_type s = segment_index_of( h ); + h -= segment_base(s); + segment_ptr_type seg = my_table[s].load(std::memory_order_acquire); + __TBB_ASSERT( is_valid(seg), "hashcode must be cut by valid mask for allocated segments" ); + return &seg[h]; + } + + // detail serial rehashing helper + void mark_rehashed_levels( hashcode_type h ) noexcept { + segment_index_type s = segment_index_of( h ); + while (segment_ptr_type seg = my_table[++s].load(std::memory_order_relaxed)) + if (rehash_required(seg[h].node_list.load(std::memory_order_relaxed))) { + seg[h].node_list.store(reinterpret_cast(empty_rehashed_flag), std::memory_order_relaxed); + mark_rehashed_levels( h + ((hashcode_type)1<node_list.load(std::memory_order_acquire))) { + return true; + } + } + return false; + } + + // Insert a node and check for load factor. @return segment index to enable. 
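+    // A worked example of the check below (illustrative numbers only): with mask == 255
+    // (256 buckets allocated), the insertion that raises my_size to 255 makes sz >= mask,
+    // so new_seg = log2(256) = 8. The thread that wins the compare_exchange on my_table[8]
+    // is the one that later calls enable_segment(8), which adds 256 buckets and publishes
+    // mask = 511; in other words, capacity doubles once the item count catches up with the mask.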
+ segment_index_type insert_new_node( bucket *b, node_base *n, hashcode_type mask ) { + size_type sz = ++my_size; // prefix form is to enforce allocation after the first item inserted + add_to_bucket( b, n ); + // check load factor + if( sz >= mask ) { // TODO: add custom load_factor + segment_index_type new_seg = tbb::detail::log2( mask+1 ); //optimized segment_index_of + __TBB_ASSERT( is_valid(my_table[new_seg-1].load(std::memory_order_relaxed)), "new allocations must not publish new mask until segment has allocated"); + static const segment_ptr_type is_allocating = segment_ptr_type(2); + segment_ptr_type disabled = nullptr; + if (!(my_table[new_seg].load(std::memory_order_acquire)) + && my_table[new_seg].compare_exchange_strong(disabled, is_allocating)) + return new_seg; // The value must be processed + } + return 0; + } + + // Prepare enough segments for number of buckets + void reserve(size_type buckets) { + if( !buckets-- ) return; + bool is_initial = !my_size.load(std::memory_order_relaxed); + for (size_type m = my_mask.load(std::memory_order_relaxed); buckets > m; + m = my_mask.load(std::memory_order_relaxed)) + { + enable_segment( segment_index_of( m+1 ), is_initial ); + } + } + + // Swap hash_map_bases + void internal_swap_content(hash_map_base &table) { + using std::swap; + swap_atomics_relaxed(my_mask, table.my_mask); + swap_atomics_relaxed(my_size, table.my_size); + + for(size_type i = 0; i < embedded_buckets; i++) { + auto temp = my_embedded_segment[i].node_list.load(std::memory_order_relaxed); + my_embedded_segment[i].node_list.store(table.my_embedded_segment[i].node_list.load(std::memory_order_relaxed), + std::memory_order_relaxed); + table.my_embedded_segment[i].node_list.store(temp, std::memory_order_relaxed); + } + for(size_type i = embedded_block; i < pointers_per_table; i++) { + auto temp = my_table[i].load(std::memory_order_relaxed); + my_table[i].store(table.my_table[i].load(std::memory_order_relaxed), + std::memory_order_relaxed); + table.my_table[i].store(temp, std::memory_order_relaxed); + } + } + + void internal_move(hash_map_base&& other) { + my_mask.store(other.my_mask.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.my_mask.store(embedded_buckets - 1, std::memory_order_relaxed); + + my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.my_size.store(0, std::memory_order_relaxed); + + for (size_type i = 0; i < embedded_buckets; ++i) { + my_embedded_segment[i].node_list.store(other.my_embedded_segment[i].node_list, std::memory_order_relaxed); + other.my_embedded_segment[i].node_list.store(nullptr, std::memory_order_relaxed); + } + + for (size_type i = embedded_block; i < pointers_per_table; ++i) { + my_table[i].store(other.my_table[i].load(std::memory_order_relaxed), + std::memory_order_relaxed); + other.my_table[i].store(nullptr, std::memory_order_relaxed); + } + } + +protected: + bucket_allocator_type my_allocator; + // Hash mask = sum of allocated segment sizes - 1 + std::atomic my_mask; + // Size of container in stored items + std::atomic my_size; // It must be in separate cache line from my_mask due to performance effects + // Zero segment + bucket my_embedded_segment[embedded_buckets]; + // Segment pointers table. Also prevents false sharing between my_mask and my_size + segments_table_type my_table; +}; + +template +class hash_map_range; + +// Meets requirements of a forward iterator for STL +// Value is either the T or const T type of the container. 
+template +class hash_map_iterator { + using map_type = Container; + using node = typename Container::node; + using map_base = typename Container::base_type; + using node_base = typename map_base::node_base; + using bucket = typename map_base::bucket; +public: + using value_type = Value; + using size_type = typename Container::size_type; + using difference_type = typename Container::difference_type; + using pointer = value_type*; + using reference = value_type&; + using iterator_category = std::forward_iterator_tag; + + // Construct undefined iterator + hash_map_iterator(): my_map(), my_index(), my_bucket(), my_node() {} + hash_map_iterator( const hash_map_iterator& other ) : + my_map(other.my_map), + my_index(other.my_index), + my_bucket(other.my_bucket), + my_node(other.my_node) + {} + + hash_map_iterator& operator=( const hash_map_iterator& other ) { + my_map = other.my_map; + my_index = other.my_index; + my_bucket = other.my_bucket; + my_node = other.my_node; + return *this; + } + + Value& operator*() const { + __TBB_ASSERT( map_base::is_valid(my_node), "iterator uninitialized or at end of container?" ); + return my_node->value(); + } + + Value* operator->() const {return &operator*();} + + hash_map_iterator& operator++() { + my_node = static_cast( my_node->next ); + if( !my_node ) advance_to_next_bucket(); + return *this; + } + + // Post increment + hash_map_iterator operator++(int) { + hash_map_iterator old(*this); + operator++(); + return old; + } +private: + template + friend bool operator==( const hash_map_iterator& i, const hash_map_iterator& j ); + + template + friend bool operator!=( const hash_map_iterator& i, const hash_map_iterator& j ); + + template + friend ptrdiff_t operator-( const hash_map_iterator& i, const hash_map_iterator& j ); + + template + friend class hash_map_iterator; + + template + friend class hash_map_range; + + void advance_to_next_bucket() { // TODO?: refactor to iterator_base class + size_t k = my_index+1; + __TBB_ASSERT( my_bucket, "advancing an invalid iterator?"); + while (k <= my_map->my_mask.load(std::memory_order_relaxed)) { + // Following test uses 2's-complement wizardry + if( k&(k-2) ) // not the beginning of a segment + ++my_bucket; + else my_bucket = my_map->get_bucket( k ); + node_base *n = my_bucket->node_list.load(std::memory_order_relaxed); + if( map_base::is_valid(n) ) { + my_node = static_cast(n); + my_index = k; + return; + } + ++k; + } + my_bucket = nullptr; my_node = nullptr; my_index = k; // the end + } + + template + __TBB_requires(tbb::detail::hash_compare && + ch_map_rw_scoped_lockable) +#else + > + __TBB_requires(tbb::detail::hash_compare) +#endif + friend class concurrent_hash_map; + + hash_map_iterator( const Container &map, std::size_t index, const bucket *b, node_base *n ) : + my_map(&map), my_index(index), my_bucket(b), my_node(static_cast(n)) + { + if( b && !map_base::is_valid(n) ) + advance_to_next_bucket(); + } + + // concurrent_hash_map over which we are iterating. 
+ const Container *my_map; + // Index in hash table for current item + size_t my_index; + // Pointer to bucket + const bucket* my_bucket; + // Pointer to node that has current item + node* my_node; +}; + +template +bool operator==( const hash_map_iterator& i, const hash_map_iterator& j ) { + return i.my_node == j.my_node && i.my_map == j.my_map; +} + +template +bool operator!=( const hash_map_iterator& i, const hash_map_iterator& j ) { + return i.my_node != j.my_node || i.my_map != j.my_map; +} + +// Range class used with concurrent_hash_map +template +class hash_map_range { + using map_type = typename Iterator::map_type; +public: + // Type for size of a range + using size_type = std::size_t; + using value_type = typename Iterator::value_type; + using reference = typename Iterator::reference; + using difference_type = typename Iterator::difference_type; + using iterator = Iterator; + + // True if range is empty. + bool empty() const { return my_begin == my_end; } + + // True if range can be partitioned into two subranges. + bool is_divisible() const { + return my_midpoint != my_end; + } + + // Split range. + hash_map_range( hash_map_range& r, split ) : + my_end(r.my_end), + my_grainsize(r.my_grainsize) + { + r.my_end = my_begin = r.my_midpoint; + __TBB_ASSERT( !empty(), "Splitting despite the range is not divisible" ); + __TBB_ASSERT( !r.empty(), "Splitting despite the range is not divisible" ); + set_midpoint(); + r.set_midpoint(); + } + + // Init range with container and grainsize specified + hash_map_range( const map_type &map, size_type grainsize_ = 1 ) : + my_begin( Iterator( map, 0, map.my_embedded_segment, map.my_embedded_segment->node_list.load(std::memory_order_relaxed) ) ), + my_end( Iterator( map, map.my_mask.load(std::memory_order_relaxed) + 1, nullptr, nullptr ) ), + my_grainsize( grainsize_ ) + { + __TBB_ASSERT( grainsize_>0, "grainsize must be positive" ); + set_midpoint(); + } + + Iterator begin() const { return my_begin; } + Iterator end() const { return my_end; } + // The grain size for this range. + size_type grainsize() const { return my_grainsize; } + +private: + Iterator my_begin; + Iterator my_end; + mutable Iterator my_midpoint; + size_t my_grainsize; + // Set my_midpoint to point approximately half way between my_begin and my_end. 
+ void set_midpoint() const; + template friend class hash_map_range; +}; + +template +void hash_map_range::set_midpoint() const { + // Split by groups of nodes + size_t m = my_end.my_index-my_begin.my_index; + if( m > my_grainsize ) { + m = my_begin.my_index + m/2u; + auto b = my_begin.my_map->get_bucket(m); + my_midpoint = Iterator(*my_begin.my_map,m,b,b->node_list.load(std::memory_order_relaxed)); + } else { + my_midpoint = my_end; + } + __TBB_ASSERT( my_begin.my_index <= my_midpoint.my_index, + "my_begin is after my_midpoint" ); + __TBB_ASSERT( my_midpoint.my_index <= my_end.my_index, + "my_midpoint is after my_end" ); + __TBB_ASSERT( my_begin != my_midpoint || my_begin == my_end, + "[my_begin, my_midpoint) range should not be empty" ); +} + +template , + typename Allocator = tbb_allocator> +#if __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS + , typename MutexType = spin_rw_mutex + > + __TBB_requires(tbb::detail::hash_compare && + ch_map_rw_scoped_lockable) +#else + > + __TBB_requires(tbb::detail::hash_compare) +#endif +class concurrent_hash_map +#if __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS + : protected hash_map_base +#else + : protected hash_map_base +#endif +{ + template + friend class hash_map_iterator; + + template + friend class hash_map_range; + using allocator_traits_type = tbb::detail::allocator_traits; + +#if __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS + using base_type = hash_map_base; +#else + using base_type = hash_map_base; +#endif +public: + using key_type = Key; + using mapped_type = T; + // type_identity is needed to disable implicit deduction guides for std::initializer_list constructors + // and copy/move constructor with explicit allocator argument + using allocator_type = tbb::detail::type_identity_t; + using hash_compare_type = tbb::detail::type_identity_t; + using value_type = std::pair; + using size_type = typename base_type::size_type; + using difference_type = std::ptrdiff_t; +#if __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS + using mutex_type = MutexType; +#endif + using pointer = typename allocator_traits_type::pointer; + using const_pointer = typename allocator_traits_type::const_pointer; + + using reference = value_type&; + using const_reference = const value_type&; + using iterator = hash_map_iterator; + using const_iterator = hash_map_iterator; + using range_type = hash_map_range; + using const_range_type = hash_map_range; + +protected: + static_assert(std::is_same::value, + "value_type of the container must be the same as its allocator's"); + + friend class const_accessor; + class node; + using segment_index_type = typename base_type::segment_index_type; + using segment_ptr_type = typename base_type::segment_ptr_type; + using node_base = typename base_type::node_base; + using bucket = typename base_type::bucket; + using hashcode_type = typename base_type::hashcode_type; + using bucket_allocator_type = typename base_type::bucket_allocator_type; + using node_allocator_type = typename base_type::allocator_traits_type::template rebind_alloc; + using node_allocator_traits = tbb::detail::allocator_traits; + hash_compare_type my_hash_compare; + + class node : public node_base { + public: + node() {} + ~node() {} + pointer storage() { return &my_value; } + value_type& value() { return *storage(); } + private: + union { + value_type my_value; + }; + }; + + void delete_node( node_base *n ) { + node_allocator_type node_allocator(this->get_allocator()); + node_allocator_traits::destroy(node_allocator, static_cast(n)->storage()); + 
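+        // the stored pair was destroyed just above; now destroy the node object itself
+        // and hand its memory back to the node allocator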
node_allocator_traits::destroy(node_allocator, static_cast(n)); + node_allocator_traits::deallocate(node_allocator, static_cast(n), 1); + } + + template + static node* create_node(bucket_allocator_type& allocator, Args&&... args) { + node_allocator_type node_allocator(allocator); + node* node_ptr = node_allocator_traits::allocate(node_allocator, 1); + auto guard = make_raii_guard([&] { + node_allocator_traits::destroy(node_allocator, node_ptr); + node_allocator_traits::deallocate(node_allocator, node_ptr, 1); + }); + + node_allocator_traits::construct(node_allocator, node_ptr); + node_allocator_traits::construct(node_allocator, node_ptr->storage(), std::forward(args)...); + guard.dismiss(); + return node_ptr; + } + + static node* allocate_node_copy_construct(bucket_allocator_type& allocator, const Key &key, const T * t){ + return create_node(allocator, key, *t); + } + + static node* allocate_node_move_construct(bucket_allocator_type& allocator, const Key &key, const T * t){ + return create_node(allocator, key, std::move(*const_cast(t))); + } + + template + static node* allocate_node_default_construct(bucket_allocator_type& allocator, const K &key, const T * ){ + // Emplace construct an empty T object inside the pair + return create_node(allocator, std::piecewise_construct, + std::forward_as_tuple(key), std::forward_as_tuple()); + } + + static node* do_not_allocate_node(bucket_allocator_type& , const Key &, const T * ){ + __TBB_ASSERT(false,"this dummy function should not be called"); + return nullptr; + } + + template + node *search_bucket( const K &key, bucket *b ) const { + node *n = static_cast( b->node_list.load(std::memory_order_relaxed) ); + while (this->is_valid(n) && !my_hash_compare.equal(key, n->value().first)) + n = static_cast( n->next ); + __TBB_ASSERT(!rehash_required(n), "Search can be executed only for rehashed bucket"); + return n; + } + + // bucket accessor is to find, rehash, acquire a lock, and access a bucket + class bucket_accessor : public bucket::scoped_type { + bucket *my_b; + public: + bucket_accessor( concurrent_hash_map *base, const hashcode_type h, bool writer = false ) { acquire( base, h, writer ); } + // find a bucket by masked hashcode, optionally rehash, and acquire the lock + inline void acquire( concurrent_hash_map *base, const hashcode_type h, bool writer = false ) { + my_b = base->get_bucket( h ); + // TODO: actually, notification is unnecessary here, just hiding double-check + if (rehash_required(my_b->node_list.load(std::memory_order_acquire)) + && bucket::scoped_type::try_acquire( my_b->mutex, /*write=*/true ) ) + { + if (rehash_required(my_b->node_list.load(std::memory_order_relaxed))) base->rehash_bucket(my_b, h); // recursive rehashing + } + else bucket::scoped_type::acquire( my_b->mutex, writer ); + __TBB_ASSERT(!rehash_required(my_b->node_list.load(std::memory_order_relaxed)), nullptr); + } + + // get bucket pointer + bucket *operator() () { return my_b; } + }; + + // TODO refactor to hash_base + void rehash_bucket( bucket *b_new, const hashcode_type hash ) { + __TBB_ASSERT( hash > 1, "The lowermost buckets can't be rehashed" ); + b_new->node_list.store(reinterpret_cast(empty_rehashed_flag), std::memory_order_release); // mark rehashed + hashcode_type mask = (hashcode_type(1) << tbb::detail::log2(hash)) - 1; // get parent mask from the topmost bit + bucket_accessor b_old( this, hash & mask ); + + mask = (mask<<1) | 1; // get full mask for new bucket + __TBB_ASSERT( (mask&(mask+1))==0 && (hash & mask) == hash, nullptr ); + restart: + node_base* 
prev = nullptr; + node_base* curr = b_old()->node_list.load(std::memory_order_acquire); + while (this->is_valid(curr)) { + hashcode_type curr_node_hash = my_hash_compare.hash(static_cast(curr)->value().first); + + if ((curr_node_hash & mask) == hash) { + if (!b_old.is_writer()) { + if (!b_old.upgrade_to_writer()) { + goto restart; // node ptr can be invalid due to concurrent erase + } + } + node_base* next = curr->next; + // exclude from b_old + if (prev == nullptr) { + b_old()->node_list.store(curr->next, std::memory_order_relaxed); + } else { + prev->next = curr->next; + } + this->add_to_bucket(b_new, curr); + curr = next; + } else { + prev = curr; + curr = curr->next; + } + } + } + + template + using hash_compare_is_transparent = dependent_bool, U>; + +public: + + class accessor; + // Combines data access, locking, and garbage collection. + class const_accessor : private node::scoped_type /*which derived from no_copy*/ { +#if __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS + friend class concurrent_hash_map; +#else + friend class concurrent_hash_map; +#endif + friend class accessor; + public: + // Type of value + using value_type = const typename concurrent_hash_map::value_type; + + // True if result is empty. + bool empty() const { return !my_node; } + + // Set to null + void release() { + if( my_node ) { + node::scoped_type::release(); + my_node = nullptr; + } + } + + // Return reference to associated value in hash table. + const_reference operator*() const { + __TBB_ASSERT( my_node, "attempt to dereference empty accessor" ); + return my_node->value(); + } + + // Return pointer to associated value in hash table. + const_pointer operator->() const { + return &operator*(); + } + + // Create empty result + const_accessor() : my_node(nullptr), my_hash() {} + + // Destroy result after releasing the underlying reference. + ~const_accessor() { + my_node = nullptr; // scoped lock's release() is called in its destructor + } + protected: + bool is_writer() { return node::scoped_type::is_writer(); } + node *my_node; + hashcode_type my_hash; + }; + + // Allows write access to elements and combines data access, locking, and garbage collection. + class accessor: public const_accessor { + public: + // Type of value + using value_type = typename concurrent_hash_map::value_type; + + // Return reference to associated value in hash table. + reference operator*() const { + __TBB_ASSERT( this->my_node, "attempt to dereference empty accessor" ); + return this->my_node->value(); + } + + // Return pointer to associated value in hash table. + pointer operator->() const { + return &operator*(); + } + }; + + explicit concurrent_hash_map( const hash_compare_type& compare, const allocator_type& a = allocator_type() ) + : base_type(a) + , my_hash_compare(compare) + {} + + concurrent_hash_map() : concurrent_hash_map(hash_compare_type()) {} + + explicit concurrent_hash_map( const allocator_type& a ) + : concurrent_hash_map(hash_compare_type(), a) + {} + + // Construct empty table with n preallocated buckets. This number serves also as initial concurrency level. 
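+    // Usage sketch (the element types and bucket count here are only an example):
+    //   concurrent_hash_map<std::string, int> table(1024); // pre-reserve ~1024 buckets
+    //   table.insert({"answer", 42});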
+ concurrent_hash_map( size_type n, const allocator_type &a = allocator_type() ) + : concurrent_hash_map(a) + { + this->reserve(n); + } + + concurrent_hash_map( size_type n, const hash_compare_type& compare, const allocator_type& a = allocator_type() ) + : concurrent_hash_map(compare, a) + { + this->reserve(n); + } + + // Copy constructor + concurrent_hash_map( const concurrent_hash_map &table ) + : concurrent_hash_map(node_allocator_traits::select_on_container_copy_construction(table.get_allocator())) + { + try_call( [&] { + internal_copy(table); + }).on_exception( [&] { + this->clear(); + }); + } + + concurrent_hash_map( const concurrent_hash_map &table, const allocator_type &a) + : concurrent_hash_map(a) + { + try_call( [&] { + internal_copy(table); + }).on_exception( [&] { + this->clear(); + }); + } + + // Move constructor + concurrent_hash_map( concurrent_hash_map &&table ) + : concurrent_hash_map(std::move(table.get_allocator())) + { + this->internal_move(std::move(table)); + } + + // Move constructor + concurrent_hash_map( concurrent_hash_map &&table, const allocator_type &a ) + : concurrent_hash_map(a) + { + using is_equal_type = typename node_allocator_traits::is_always_equal; + internal_move_construct_with_allocator(std::move(table), a, is_equal_type()); + } + + // Construction with copying iteration range and given allocator instance + template + concurrent_hash_map( I first, I last, const allocator_type &a = allocator_type() ) + : concurrent_hash_map(a) + { + try_call( [&] { + internal_copy(first, last, std::distance(first, last)); + }).on_exception( [&] { + this->clear(); + }); + } + + template + concurrent_hash_map( I first, I last, const hash_compare_type& compare, const allocator_type& a = allocator_type() ) + : concurrent_hash_map(compare, a) + { + try_call( [&] { + internal_copy(first, last, std::distance(first, last)); + }).on_exception( [&] { + this->clear(); + }); + } + + concurrent_hash_map( std::initializer_list il, const hash_compare_type& compare = hash_compare_type(), const allocator_type& a = allocator_type() ) + : concurrent_hash_map(compare, a) + { + try_call( [&] { + internal_copy(il.begin(), il.end(), il.size()); + }).on_exception( [&] { + this->clear(); + }); + } + + concurrent_hash_map( std::initializer_list il, const allocator_type& a ) + : concurrent_hash_map(il, hash_compare_type(), a) {} + + // Assignment + concurrent_hash_map& operator=( const concurrent_hash_map &table ) { + if( this != &table ) { + clear(); + copy_assign_allocators(this->my_allocator, table.my_allocator); + internal_copy(table); + } + return *this; + } + + // Move Assignment + concurrent_hash_map& operator=( concurrent_hash_map &&table ) { + if( this != &table ) { + using pocma_type = typename node_allocator_traits::propagate_on_container_move_assignment; + using is_equal_type = typename node_allocator_traits::is_always_equal; + move_assign_allocators(this->my_allocator, table.my_allocator); + internal_move_assign(std::move(table), tbb::detail::disjunction()); + } + return *this; + } + + // Assignment + concurrent_hash_map& operator=( std::initializer_list il ) { + clear(); + internal_copy(il.begin(), il.end(), il.size()); + return *this; + } + + // Rehashes and optionally resizes the whole table. + /** Useful to optimize performance before or after concurrent operations. + Also enables using of find() and count() concurrent methods in serial context. 
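+        A usage sketch (illustrative only):
+            tbb::concurrent_hash_map<int, int> m;
+            // ... a parallel insert/find phase completes ...
+            m.rehash();           // serial step: finish buckets still marked for rehashing
+            auto c = m.count(1);  // count()/find() may now also be used from serial code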
*/ + void rehash(size_type sz = 0) { + this->reserve(sz); // TODO: add reduction of number of buckets as well + hashcode_type mask = this->my_mask.load(std::memory_order_relaxed); + hashcode_type b = (mask+1)>>1; // size or first index of the last segment + __TBB_ASSERT((b&(b-1))==0, nullptr); // zero or power of 2 + bucket *bp = this->get_bucket( b ); // only the last segment should be scanned for rehashing + for(; b <= mask; b++, bp++ ) { + node_base *n = bp->node_list.load(std::memory_order_relaxed); + __TBB_ASSERT( this->is_valid(n) || empty_rehashed(n) || rehash_required(n), "Broken internal structure" ); + __TBB_ASSERT( *reinterpret_cast(&bp->mutex) == 0, "concurrent or unexpectedly terminated operation during rehash() execution" ); + if (rehash_required(n)) { // rehash bucket, conditional because rehashing of a previous bucket may affect this one + hashcode_type h = b; bucket *b_old = bp; + do { + __TBB_ASSERT( h > 1, "The lowermost buckets can't be rehashed" ); + hashcode_type m = ( hashcode_type(1) << tbb::detail::log2( h ) ) - 1; // get parent mask from the topmost bit + b_old = this->get_bucket( h &= m ); + } while( rehash_required(b_old->node_list.load(std::memory_order_relaxed)) ); + // now h - is index of the root rehashed bucket b_old + this->mark_rehashed_levels( h ); // mark all non-rehashed children recursively across all segments + node_base* prev = nullptr; + node_base* curr = b_old->node_list.load(std::memory_order_relaxed); + while (this->is_valid(curr)) { + hashcode_type curr_node_hash = my_hash_compare.hash(static_cast(curr)->value().first); + + if ((curr_node_hash & mask) != h) { // should be rehashed + node_base* next = curr->next; + // exclude from b_old + if (prev == nullptr) { + b_old->node_list.store(curr->next, std::memory_order_relaxed); + } else { + prev->next = curr->next; + } + bucket *b_new = this->get_bucket(curr_node_hash & mask); + __TBB_ASSERT(!rehash_required(b_new->node_list.load(std::memory_order_relaxed)), "hash() function changed for key in table or internal error"); + this->add_to_bucket(b_new, curr); + curr = next; + } else { + prev = curr; + curr = curr->next; + } + } + } + } + } + + // Clear table + void clear() { + hashcode_type m = this->my_mask.load(std::memory_order_relaxed); + __TBB_ASSERT((m&(m+1))==0, "data structure is invalid"); + this->my_size.store(0, std::memory_order_relaxed); + segment_index_type s = this->segment_index_of( m ); + __TBB_ASSERT( s+1 == this->pointers_per_table || !this->my_table[s+1].load(std::memory_order_relaxed), "wrong mask or concurrent grow" ); + do { + __TBB_ASSERT(this->is_valid(this->my_table[s].load(std::memory_order_relaxed)), "wrong mask or concurrent grow" ); + segment_ptr_type buckets_ptr = this->my_table[s].load(std::memory_order_relaxed); + size_type sz = this->segment_size( s ? s : 1 ); + for( segment_index_type i = 0; i < sz; i++ ) + for( node_base *n = buckets_ptr[i].node_list.load(std::memory_order_relaxed); + this->is_valid(n); n = buckets_ptr[i].node_list.load(std::memory_order_relaxed) ) + { + buckets_ptr[i].node_list.store(n->next, std::memory_order_relaxed); + delete_node( n ); + } + this->delete_segment(s); + } while(s-- > 0); + this->my_mask.store(this->embedded_buckets - 1, std::memory_order_relaxed); + } + + // Clear table and destroy it. 
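+    // (Not thread-safe: as with clear(), no other operation may run on the table concurrently.)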
+ ~concurrent_hash_map() { clear(); } + + //------------------------------------------------------------------------ + // Parallel algorithm support + //------------------------------------------------------------------------ + range_type range( size_type grainsize=1 ) { + return range_type( *this, grainsize ); + } + const_range_type range( size_type grainsize=1 ) const { + return const_range_type( *this, grainsize ); + } + + //------------------------------------------------------------------------ + // STL support - not thread-safe methods + //------------------------------------------------------------------------ + iterator begin() { return iterator( *this, 0, this->my_embedded_segment, this->my_embedded_segment->node_list.load(std::memory_order_relaxed) ); } + const_iterator begin() const { return const_iterator( *this, 0, this->my_embedded_segment, this->my_embedded_segment->node_list.load(std::memory_order_relaxed) ); } + const_iterator cbegin() const { return const_iterator( *this, 0, this->my_embedded_segment, this->my_embedded_segment->node_list.load(std::memory_order_relaxed) ); } + iterator end() { return iterator( *this, 0, nullptr, nullptr ); } + const_iterator end() const { return const_iterator( *this, 0, nullptr, nullptr ); } + const_iterator cend() const { return const_iterator( *this, 0, nullptr, nullptr ); } + std::pair equal_range( const Key& key ) { return internal_equal_range( key, end() ); } + std::pair equal_range( const Key& key ) const { return internal_equal_range( key, end() ); } + + template + typename std::enable_if::value, + std::pair>::type equal_range( const K& key ) { + return internal_equal_range(key, end()); + } + + template + typename std::enable_if::value, + std::pair>::type equal_range( const K& key ) const { + return internal_equal_range(key, end()); + } + + // Number of items in table. + size_type size() const { return this->my_size.load(std::memory_order_acquire); } + + // True if size()==0. + __TBB_nodiscard bool empty() const { return size() == 0; } + + // Upper bound on size. + size_type max_size() const { + return allocator_traits_type::max_size(base_type::get_allocator()); + } + + // Returns the current number of buckets + size_type bucket_count() const { return this->my_mask.load(std::memory_order_relaxed) + 1; } + + // return allocator object + allocator_type get_allocator() const { return base_type::get_allocator(); } + + // swap two instances. Iterators are invalidated + void swap(concurrent_hash_map& table) { + using pocs_type = typename node_allocator_traits::propagate_on_container_swap; + using is_equal_type = typename node_allocator_traits::is_always_equal; + swap_allocators(this->my_allocator, table.my_allocator); + internal_swap(table, tbb::detail::disjunction()); + } + + //------------------------------------------------------------------------ + // concurrent map operations + //------------------------------------------------------------------------ + + // Return count of items (0 or 1) + size_type count( const Key &key ) const { + return const_cast(this)->lookup(key, nullptr, nullptr, /*write=*/false, &do_not_allocate_node); + } + + template + typename std::enable_if::value, + size_type>::type count( const K& key ) const { + return const_cast(this)->lookup(key, nullptr, nullptr, /*write=*/false, &do_not_allocate_node); + } + + // Find item and acquire a read lock on the item. + /** Return true if item is found, false otherwise. 
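+        A typical read-side pattern (the surrounding names are illustrative only):
+            concurrent_hash_map<int, std::string>::const_accessor a;
+            if (table.find(a, key)) {
+                read(a->second);   // the element stays read-locked while 'a' holds it
+            }                      // the lock is dropped when 'a' is released or destroyed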
*/ + bool find( const_accessor &result, const Key &key ) const { + result.release(); + return const_cast(this)->lookup(key, nullptr, &result, /*write=*/false, &do_not_allocate_node ); + } + + // Find item and acquire a write lock on the item. + /** Return true if item is found, false otherwise. */ + bool find( accessor &result, const Key &key ) { + result.release(); + return lookup(key, nullptr, &result, /*write=*/true, &do_not_allocate_node); + } + + template + typename std::enable_if::value, + bool>::type find( const_accessor& result, const K& key ) { + result.release(); + return lookup(key, nullptr, &result, /*write=*/false, &do_not_allocate_node); + } + + template + typename std::enable_if::value, + bool>::type find( accessor& result, const K& key ) { + result.release(); + return lookup(key, nullptr, &result, /*write=*/true, &do_not_allocate_node); + } + + // Insert item (if not already present) and acquire a read lock on the item. + /** Returns true if item is new. */ + bool insert( const_accessor &result, const Key &key ) { + result.release(); + return lookup(key, nullptr, &result, /*write=*/false, &allocate_node_default_construct<>); + } + + // Insert item (if not already present) and acquire a write lock on the item. + /** Returns true if item is new. */ + bool insert( accessor &result, const Key &key ) { + result.release(); + return lookup(key, nullptr, &result, /*write=*/true, &allocate_node_default_construct<>); + } + + template + typename std::enable_if::value && + std::is_constructible::value, + bool>::type insert( const_accessor& result, const K& key ) { + result.release(); + return lookup(key, nullptr, &result, /*write=*/false, &allocate_node_default_construct); + } + + template + typename std::enable_if::value && + std::is_constructible::value, + bool>::type insert( accessor& result, const K& key ) { + result.release(); + return lookup(key, nullptr, &result, /*write=*/true, &allocate_node_default_construct); + } + + // Insert item by copying if there is no such key present already and acquire a read lock on the item. + /** Returns true if item is new. */ + bool insert( const_accessor &result, const value_type &value ) { + result.release(); + return lookup(value.first, &value.second, &result, /*write=*/false, &allocate_node_copy_construct); + } + + // Insert item by copying if there is no such key present already and acquire a write lock on the item. + /** Returns true if item is new. */ + bool insert( accessor &result, const value_type &value ) { + result.release(); + return lookup(value.first, &value.second, &result, /*write=*/true, &allocate_node_copy_construct); + } + + // Insert item by copying if there is no such key present already + /** Returns true if item is inserted. */ + bool insert( const value_type &value ) { + return lookup(value.first, &value.second, nullptr, /*write=*/false, &allocate_node_copy_construct); + } + + // Insert item by copying if there is no such key present already and acquire a read lock on the item. + /** Returns true if item is new. */ + bool insert( const_accessor &result, value_type && value ) { + return generic_move_insert(result, std::move(value)); + } + + // Insert item by copying if there is no such key present already and acquire a write lock on the item. + /** Returns true if item is new. */ + bool insert( accessor &result, value_type && value ) { + return generic_move_insert(result, std::move(value)); + } + + // Insert item by copying if there is no such key present already + /** Returns true if item is inserted. 
*/ + bool insert( value_type && value ) { + return generic_move_insert(accessor_not_used(), std::move(value)); + } + + // Insert item by copying if there is no such key present already and acquire a read lock on the item. + /** Returns true if item is new. */ + template + bool emplace( const_accessor &result, Args&&... args ) { + return generic_emplace(result, std::forward(args)...); + } + + // Insert item by copying if there is no such key present already and acquire a write lock on the item. + /** Returns true if item is new. */ + template + bool emplace( accessor &result, Args&&... args ) { + return generic_emplace(result, std::forward(args)...); + } + + // Insert item by copying if there is no such key present already + /** Returns true if item is inserted. */ + template + bool emplace( Args&&... args ) { + return generic_emplace(accessor_not_used(), std::forward(args)...); + } + + // Insert range [first, last) + template + void insert( I first, I last ) { + for ( ; first != last; ++first ) + insert( *first ); + } + + // Insert initializer list + void insert( std::initializer_list il ) { + insert( il.begin(), il.end() ); + } + + // Erase item. + /** Return true if item was erased by particularly this call. */ + bool erase( const Key &key ) { + return internal_erase(key); + } + + template + typename std::enable_if::value, + bool>::type erase( const K& key ) { + return internal_erase(key); + } + + // Erase item by const_accessor. + /** Return true if item was erased by particularly this call. */ + bool erase( const_accessor& item_accessor ) { + return exclude( item_accessor ); + } + + // Erase item by accessor. + /** Return true if item was erased by particularly this call. */ + bool erase( accessor& item_accessor ) { + return exclude( item_accessor ); + } + +protected: + template + node* allocate_node_helper( const K& key, const T* t, AllocateNodeType allocate_node, std::true_type ) { + return allocate_node(base_type::get_allocator(), key, t); + } + + template + node* allocate_node_helper( const K&, const T*, AllocateNodeType, std::false_type ) { + __TBB_ASSERT(false, "allocate_node_helper with std::false_type should never been called"); + return nullptr; + } + + // Insert or find item and optionally acquire a lock on the item. 
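+    // Outline of the protocol implemented below (a reading aid, slightly simplified):
+    //   1. hash the key and read the current mask;
+    //   2. take a bucket_accessor on bucket (hash & mask), rehashing that bucket first if
+    //      it is still marked as requiring a rehash;
+    //   3. search the bucket chain; for an insertion, allocate the node and upgrade to a
+    //      write lock, re-searching after every upgrade/downgrade since other threads may
+    //      have inserted or erased the key in the meantime;
+    //   4. if the mask changed concurrently (check_mask_race), restart from step 1;
+    //   5. optionally lock the found or inserted item into *result, then enable a new
+    //      segment if insert_new_node requested growth.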
+ template + bool lookup( const K &key, const T *t, const_accessor *result, bool write, AllocateNodeType allocate_node, node *tmp_n = nullptr) + { + __TBB_ASSERT( !result || !result->my_node, nullptr ); + bool return_value; + hashcode_type const h = my_hash_compare.hash( key ); + hashcode_type m = this->my_mask.load(std::memory_order_acquire); + segment_index_type grow_segment = 0; + node *n; + restart: + {//lock scope + __TBB_ASSERT((m&(m+1))==0, "data structure is invalid"); + return_value = false; + // get bucket + bucket_accessor b( this, h & m ); + // find a node + n = search_bucket( key, b() ); + if( OpInsert ) { + // [opt] insert a key + if( !n ) { + if( !tmp_n ) { + tmp_n = allocate_node_helper(key, t, allocate_node, std::integral_constant{}); + } + while ( !b.is_writer() && !b.upgrade_to_writer() ) { // TODO: improved insertion + // Rerun search list, in case another thread inserted the intem during the upgrade + n = search_bucket(key, b()); + if (this->is_valid(n)) { // unfortunately, it did + if (!b.downgrade_to_reader()) { + // If the lock was downgraded with reacquiring the mutex + // Rerun search list in case another thread removed the item during the downgrade + n = search_bucket(key, b()); + if (!this->is_valid(n)) { + // Unfortunately, it did + // We need to try upgrading to writer again + continue; + } + } + goto exists; + } + } + + if( this->check_mask_race(h, m) ) + goto restart; // b.release() is done in ~b(). + // insert and set flag to grow the container + grow_segment = this->insert_new_node( b(), n = tmp_n, m ); + tmp_n = nullptr; + return_value = true; + } + } else { // find or count + if( !n ) { + if( this->check_mask_race( h, m ) ) + goto restart; // b.release() is done in ~b(). TODO: replace by continue + return false; + } + return_value = true; + } + exists: + if( !result ) goto check_growth; + // TODO: the following seems as generic/regular operation + // acquire the item + if( !result->try_acquire( n->mutex, write ) ) { + for( tbb::detail::atomic_backoff backoff(true);; ) { + if( result->try_acquire( n->mutex, write ) ) break; + if( !backoff.bounded_pause() ) { + // the wait takes really long, restart the operation + b.release(); + __TBB_ASSERT( !OpInsert || !return_value, "Can't acquire new item in locked bucket?" ); + yield(); + m = this->my_mask.load(std::memory_order_acquire); + goto restart; + } + } + } + }//lock scope + result->my_node = n; + result->my_hash = h; + check_growth: + // [opt] grow the container + if( grow_segment ) { + this->enable_segment( grow_segment ); + } + if( tmp_n ) // if OpInsert only + delete_node( tmp_n ); + return return_value; + } + + struct accessor_not_used { void release(){}}; + friend const_accessor* accessor_location( accessor_not_used const& ){ return nullptr;} + friend const_accessor* accessor_location( const_accessor & a ) { return &a;} + + friend bool is_write_access_needed( accessor const& ) { return true;} + friend bool is_write_access_needed( const_accessor const& ) { return false;} + friend bool is_write_access_needed( accessor_not_used const& ) { return false;} + + template + bool generic_move_insert( Accessor && result, value_type && value ) { + result.release(); + return lookup(value.first, &value.second, accessor_location(result), is_write_access_needed(result), &allocate_node_move_construct); + } + + template + bool generic_emplace( Accessor && result, Args &&... 
args ) { + result.release(); + node * node_ptr = create_node(base_type::get_allocator(), std::forward(args)...); + return lookup(node_ptr->value().first, nullptr, accessor_location(result), is_write_access_needed(result), &do_not_allocate_node, node_ptr); + } + + // delete item by accessor + bool exclude( const_accessor &item_accessor ) { + __TBB_ASSERT( item_accessor.my_node, nullptr ); + node_base *const exclude_node = item_accessor.my_node; + hashcode_type const hash = item_accessor.my_hash; + hashcode_type mask = this->my_mask.load(std::memory_order_acquire); + do { + // get bucket + bucket_accessor b( this, hash & mask, /*writer=*/true ); + node_base* prev = nullptr; + node_base* curr = b()->node_list.load(std::memory_order_relaxed); + + while (curr && curr != exclude_node) { + prev = curr; + curr = curr->next; + } + + if (curr == nullptr) { // someone else was first + if (this->check_mask_race(hash, mask)) + continue; + item_accessor.release(); + return false; + } + __TBB_ASSERT( curr == exclude_node, nullptr ); + // remove from container + if (prev == nullptr) { + b()->node_list.store(curr->next, std::memory_order_relaxed); + } else { + prev->next = curr->next; + } + + this->my_size--; + break; + } while(true); + if (!item_accessor.is_writer()) { // need to get exclusive lock + item_accessor.upgrade_to_writer(); // return value means nothing here + } + + item_accessor.release(); + delete_node(exclude_node); // Only one thread can delete it + return true; + } + + template + bool internal_erase( const K& key ) { + node_base *erase_node; + hashcode_type const hash = my_hash_compare.hash(key); + hashcode_type mask = this->my_mask.load(std::memory_order_acquire); + restart: + {//lock scope + // get bucket + bucket_accessor b( this, hash & mask ); + search: + node_base* prev = nullptr; + erase_node = b()->node_list.load(std::memory_order_relaxed); + while (this->is_valid(erase_node) && !my_hash_compare.equal(key, static_cast(erase_node)->value().first ) ) { + prev = erase_node; + erase_node = erase_node->next; + } + + if (erase_node == nullptr) { // not found, but mask could be changed + if (this->check_mask_race(hash, mask)) + goto restart; + return false; + } else if (!b.is_writer() && !b.upgrade_to_writer()) { + if (this->check_mask_race(hash, mask)) // contended upgrade, check mask + goto restart; + goto search; + } + + // remove from container + if (prev == nullptr) { + b()->node_list.store(erase_node->next, std::memory_order_relaxed); + } else { + prev->next = erase_node->next; + } + this->my_size--; + } + { + typename node::scoped_type item_locker( erase_node->mutex, /*write=*/true ); + } + // note: there should be no threads pretending to acquire this mutex again, do not try to upgrade const_accessor! 
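+        // The node was already unlinked from its bucket above, under the bucket's write lock,
+        // so no new thread can reach it; this transient write acquisition of its mutex merely
+        // waits out any accessor that still held the element when it was unlinked.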
+ delete_node(erase_node); // Only one thread can delete it due to write lock on the bucket + return true; + } + + // Returns an iterator for an item defined by the key, or for the next item after it (if upper==true) + template + std::pair internal_equal_range( const K& key, I end_ ) const { + hashcode_type h = my_hash_compare.hash( key ); + hashcode_type m = this->my_mask.load(std::memory_order_relaxed); + __TBB_ASSERT((m&(m+1))==0, "data structure is invalid"); + h &= m; + bucket *b = this->get_bucket( h ); + while (rehash_required(b->node_list.load(std::memory_order_relaxed))) { + m = ( hashcode_type(1) << tbb::detail::log2( h ) ) - 1; // get parent mask from the topmost bit + b = this->get_bucket( h &= m ); + } + node *n = search_bucket( key, b ); + if( !n ) + return std::make_pair(end_, end_); + iterator lower(*this, h, b, n), upper(lower); + return std::make_pair(lower, ++upper); + } + + // Copy "source" to *this, where *this must start out empty. + void internal_copy( const concurrent_hash_map& source ) { + hashcode_type mask = source.my_mask.load(std::memory_order_relaxed); + if( this->my_mask.load(std::memory_order_relaxed) == mask ) { // optimized version + this->reserve(source.my_size.load(std::memory_order_relaxed)); // TODO: load_factor? + bucket *dst = nullptr, *src = nullptr; + bool rehashing_required = false; + for( hashcode_type k = 0; k <= mask; k++ ) { + if( k & (k-2) ) ++dst,src++; // not the beginning of a segment + else { dst = this->get_bucket( k ); src = source.get_bucket( k ); } + __TBB_ASSERT(!rehash_required(dst->node_list.load(std::memory_order_relaxed)), "Invalid bucket in destination table"); + node *n = static_cast( src->node_list.load(std::memory_order_relaxed) ); + if (rehash_required(n)) { // source is not rehashed, items are in previous buckets + rehashing_required = true; + dst->node_list.store(reinterpret_cast(rehash_req_flag), std::memory_order_relaxed); + } else for(; n; n = static_cast( n->next ) ) { + node* node_ptr = create_node(base_type::get_allocator(), n->value().first, n->value().second); + this->add_to_bucket( dst, node_ptr); + this->my_size.fetch_add(1, std::memory_order_relaxed); + } + } + if( rehashing_required ) rehash(); + } else internal_copy(source.begin(), source.end(), source.my_size.load(std::memory_order_relaxed)); + } + + template + void internal_copy( I first, I last, size_type reserve_size ) { + this->reserve(reserve_size); // TODO: load_factor? 
+ hashcode_type m = this->my_mask.load(std::memory_order_relaxed); + for(; first != last; ++first) { + hashcode_type h = my_hash_compare.hash( (*first).first ); + bucket *b = this->get_bucket( h & m ); + __TBB_ASSERT(!rehash_required(b->node_list.load(std::memory_order_relaxed)), "Invalid bucket in destination table"); + node* node_ptr = create_node(base_type::get_allocator(), (*first).first, (*first).second); + this->add_to_bucket( b, node_ptr ); + ++this->my_size; // TODO: replace by non-atomic op + } + } + + void internal_move_construct_with_allocator( concurrent_hash_map&& other, const allocator_type&, + /*is_always_equal=*/std::true_type ) + { + this->internal_move(std::move(other)); + } + + void internal_move_construct_with_allocator( concurrent_hash_map&& other, const allocator_type& a, + /*is_always_equal=*/std::false_type ) + { + if (a == other.get_allocator()){ + this->internal_move(std::move(other)); + } else { + try_call( [&] { + internal_copy(std::make_move_iterator(other.begin()), std::make_move_iterator(other.end()), + other.size()); + }).on_exception( [&] { + this->clear(); + }); + } + } + + void internal_move_assign( concurrent_hash_map&& other, + /*is_always_equal || POCMA = */std::true_type) + { + this->internal_move(std::move(other)); + } + + void internal_move_assign(concurrent_hash_map&& other, /*is_always_equal=*/ std::false_type) { + if (this->my_allocator == other.my_allocator) { + this->internal_move(std::move(other)); + } else { + //do per element move + internal_copy(std::make_move_iterator(other.begin()), std::make_move_iterator(other.end()), + other.size()); + } + } + + void internal_swap(concurrent_hash_map& other, /*is_always_equal || POCS = */ std::true_type) { + this->internal_swap_content(other); + } + + void internal_swap(concurrent_hash_map& other, /*is_always_equal || POCS = */ std::false_type) { + __TBB_ASSERT(this->my_allocator == other.my_allocator, nullptr); + this->internal_swap_content(other); + } + + // Fast find when no concurrent erasure is used. For internal use inside TBB only! + /** Return pointer to item with given key, or nullptr if no such item exists. + Must not be called concurrently with erasure operations. 
*/ + const_pointer internal_fast_find( const Key& key ) const { + hashcode_type h = my_hash_compare.hash( key ); + hashcode_type m = this->my_mask.load(std::memory_order_acquire); + node *n; + restart: + __TBB_ASSERT((m&(m+1))==0, "data structure is invalid"); + bucket *b = this->get_bucket( h & m ); + // TODO: actually, notification is unnecessary here, just hiding double-check + if (rehash_required(b->node_list.load(std::memory_order_acquire))) + { + typename bucket::scoped_type lock; + if( lock.try_acquire( b->mutex, /*write=*/true ) ) { + if (rehash_required(b->node_list.load(std::memory_order_relaxed))) + const_cast(this)->rehash_bucket( b, h & m ); //recursive rehashing + } + else lock.acquire( b->mutex, /*write=*/false ); + __TBB_ASSERT(!rehash_required(b->node_list.load(std::memory_order_relaxed)), nullptr); + } + n = search_bucket( key, b ); + if( n ) + return n->storage(); + else if( this->check_mask_race( h, m ) ) + goto restart; + return nullptr; + } +}; + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT +template >, + typename Alloc = tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_hash_map( It, It, HashCompare = HashCompare(), Alloc = Alloc() ) +-> concurrent_hash_map, iterator_mapped_t, HashCompare, Alloc>; + +template >, + typename = std::enable_if_t>> +concurrent_hash_map( It, It, Alloc ) +-> concurrent_hash_map, iterator_mapped_t, d1::tbb_hash_compare>, Alloc>; + +template >, + typename Alloc = tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_hash_map( std::initializer_list>, HashCompare = HashCompare(), Alloc = Alloc() ) +-> concurrent_hash_map, T, HashCompare, Alloc>; + +template >> +concurrent_hash_map( std::initializer_list>, Alloc ) +-> concurrent_hash_map, T, d1::tbb_hash_compare>, Alloc>; + +#endif /* __TBB_CPP17_DEDUCTION_GUIDES_PRESENT */ + +template +inline bool operator==(const concurrent_hash_map &a, const concurrent_hash_map &b) { + if(a.size() != b.size()) return false; + typename concurrent_hash_map::const_iterator i(a.begin()), i_end(a.end()); + typename concurrent_hash_map::const_iterator j, j_end(b.end()); + for(; i != i_end; ++i) { + j = b.equal_range(i->first).first; + if( j == j_end || !(i->second == j->second) ) return false; + } + return true; +} + +#if !__TBB_CPP20_COMPARISONS_PRESENT +template +inline bool operator!=(const concurrent_hash_map &a, const concurrent_hash_map &b) +{ return !(a == b); } +#endif // !__TBB_CPP20_COMPARISONS_PRESENT + +template +inline void swap(concurrent_hash_map &a, concurrent_hash_map &b) +{ a.swap( b ); } + +} // namespace d2 +} // namespace detail + +inline namespace v1 { + using detail::split; + using detail::d2::concurrent_hash_map; + using detail::d1::tbb_hash_compare; +} // namespace v1 + +} // namespace tbb + +#endif /* __TBB_concurrent_hash_map_H */ diff --git a/third_party/tbb/concurrent_lru_cache.h b/third_party/tbb/concurrent_lru_cache.h new file mode 100644 index 000000000..d8fe096b4 --- /dev/null +++ b/third_party/tbb/concurrent_lru_cache.h @@ -0,0 +1,375 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_concurrent_lru_cache_H +#define __TBB_concurrent_lru_cache_H + +#if ! TBB_PREVIEW_CONCURRENT_LRU_CACHE + #error Set TBB_PREVIEW_CONCURRENT_LRU_CACHE to include concurrent_lru_cache.h +#endif + +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_aggregator.h" + +#include "third_party/libcxx/map" // for std::map +#include "third_party/libcxx/list" // for std::list +#include "third_party/libcxx/utility" // for std::make_pair +#include "third_party/libcxx/algorithm" // for std::find +#include "third_party/libcxx/atomic" // for std::atomic + +namespace tbb { + +namespace detail { +namespace d1 { + +//----------------------------------------------------------------------------- +// Concurrent LRU cache +//----------------------------------------------------------------------------- + +template +class concurrent_lru_cache : no_assign { +// incapsulated helper classes +private: + struct handle_object; + struct storage_map_value_type; + + struct aggregator_operation; + struct retrieve_aggregator_operation; + struct signal_end_of_usage_aggregator_operation; + +// typedefs +public: + using key_type = KeyT; + using value_type = ValT; + using pointer = ValT*; + using reference = ValT&; + using const_pointer = const ValT*; + using const_reference = const ValT&; + + using value_function_type = KeyToValFunctorT; + using handle = handle_object; +private: + using lru_cache_type = concurrent_lru_cache; + + using storage_map_type = std::map; + using storage_map_iterator_type = typename storage_map_type::iterator; + using storage_map_pointer_type = typename storage_map_type::pointer; + using storage_map_reference_type = typename storage_map_type::reference; + + using history_list_type = std::list; + using history_list_iterator_type = typename history_list_type::iterator; + + using aggregator_operation_type = aggregator_operation; + using aggregator_function_type = aggregating_functor; + using aggregator_type = aggregator; + + friend class aggregating_functor; + +// fields +private: + value_function_type my_value_function; + aggregator_type my_aggregator; + + storage_map_type my_storage_map; // storage map for used objects + history_list_type my_history_list; // history list for unused objects + const std::size_t my_history_list_capacity; // history list's allowed capacity + +// interface +public: + + concurrent_lru_cache(value_function_type value_function, std::size_t cache_capacity) + : my_value_function(value_function), my_history_list_capacity(cache_capacity) { + my_aggregator.initialize_handler(aggregator_function_type(this)); + } + + handle operator[](key_type key) { + retrieve_aggregator_operation op(key); + my_aggregator.execute(&op); + + if (op.is_new_value_needed()) { + op.result().second.my_value = my_value_function(key); + op.result().second.my_is_ready.store(true, std::memory_order_release); + } else { + spin_wait_while_eq(op.result().second.my_is_ready, false); + } + + return handle(*this, op.result()); + } + +private: + + void handle_operations(aggregator_operation* op_list) { + while (op_list) { + op_list->cast_and_handle(*this); + 
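+            // operations queued by other threads are handled here one at a time; storing 1
+            // into 'status' below is what lets the thread that issued the operation proceed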
            aggregator_operation* prev_op = op_list;
+            op_list = op_list->next;
+
+            (prev_op->status).store(1, std::memory_order_release);
+        }
+    }
+
+    void signal_end_of_usage(storage_map_reference_type map_record_ref) {
+        signal_end_of_usage_aggregator_operation op(map_record_ref);
+        my_aggregator.execute(&op);
+    }
+
+    void signal_end_of_usage_serial(storage_map_reference_type map_record_ref) {
+        storage_map_iterator_type map_it = my_storage_map.find(map_record_ref.first);
+
+        __TBB_ASSERT(map_it != my_storage_map.end(),
+            "cache should not return past-end iterators to outer world");
+        __TBB_ASSERT(&(*map_it) == &map_record_ref,
+            "dangling reference has been returned to outside world: data race?");
+        __TBB_ASSERT(std::find(my_history_list.begin(), my_history_list.end(), map_it) == my_history_list.end(),
+            "an object in use should not be in the list of unused objects");
+
+        // if it was the last reference, put the item into the LRU history
+        if (! --(map_it->second.my_ref_counter)) {
+            // if the LRU history is full, evict the oldest items to get space
+            if (my_history_list.size() >= my_history_list_capacity) {
+                if (my_history_list_capacity == 0) {
+                    // the LRU history capacity is zero, so there is no need to keep the element in history
+                    my_storage_map.erase(map_it);
+                    return;
+                }
+                std::size_t number_of_elements_to_evict = 1 + my_history_list.size() - my_history_list_capacity;
+
+                for (std::size_t i = 0; i < number_of_elements_to_evict; ++i) {
+                    storage_map_iterator_type map_it_to_evict = my_history_list.back();
+
+                    __TBB_ASSERT(map_it_to_evict->second.my_ref_counter == 0,
+                        "an item to be evicted should not have live references");
+
+                    // TODO: can we use forward_list instead of list? pop_front / insert_after last
+                    my_history_list.pop_back();
+                    my_storage_map.erase(map_it_to_evict);
+                }
+            }
+
+            // TODO: can we use forward_list instead of list? pop_front / insert_after last
+            my_history_list.push_front(map_it);
+            map_it->second.my_history_list_iterator = my_history_list.begin();
+        }
+    }
+
+    storage_map_reference_type retrieve_serial(key_type key, bool& is_new_value_needed) {
+        storage_map_iterator_type map_it = my_storage_map.find(key);
+
+        if (map_it == my_storage_map.end()) {
+            map_it = my_storage_map.emplace_hint(
+                map_it, std::piecewise_construct, std::make_tuple(key), std::make_tuple(value_type(), 0, my_history_list.end(), false));
+            is_new_value_needed = true;
+        } else {
+            history_list_iterator_type list_it = map_it->second.my_history_list_iterator;
+            if (list_it != my_history_list.end()) {
+                __TBB_ASSERT(map_it->second.my_ref_counter == 0,
+                    "an item to be evicted should not have live references");
+
+                // Item is going to be used. Therefore it is not subject to eviction,
+                // so we remove it from LRU history.
+ my_history_list.erase(list_it); + map_it->second.my_history_list_iterator = my_history_list.end(); + } + } + + ++(map_it->second.my_ref_counter); + return *map_it; + } +}; + +//----------------------------------------------------------------------------- +// Value type for storage map in concurrent LRU cache +//----------------------------------------------------------------------------- + +template +struct concurrent_lru_cache::storage_map_value_type { +//typedefs +public: + using ref_counter_type = std::size_t; + +// fields +public: + value_type my_value; + ref_counter_type my_ref_counter; + history_list_iterator_type my_history_list_iterator; + std::atomic my_is_ready; + +// interface +public: + storage_map_value_type( + value_type const& value, ref_counter_type ref_counter, + history_list_iterator_type history_list_iterator, bool is_ready) + : my_value(value), my_ref_counter(ref_counter), + my_history_list_iterator(history_list_iterator), my_is_ready(is_ready) {} +}; + +//----------------------------------------------------------------------------- +// Handle object for operator[] in concurrent LRU cache +//----------------------------------------------------------------------------- + +template +struct concurrent_lru_cache::handle_object { +// fields +private: + lru_cache_type* my_lru_cache_ptr; + storage_map_pointer_type my_map_record_ptr; + +// interface +public: + handle_object() + : my_lru_cache_ptr(nullptr), my_map_record_ptr(nullptr) {} + handle_object(lru_cache_type& lru_cache_ref, storage_map_reference_type map_record_ref) + : my_lru_cache_ptr(&lru_cache_ref), my_map_record_ptr(&map_record_ref) {} + + handle_object(handle_object&) = delete; + void operator=(handle_object&) = delete; + + handle_object(handle_object&& other) + : my_lru_cache_ptr(other.my_lru_cache_ptr), my_map_record_ptr(other.my_map_record_ptr) { + + __TBB_ASSERT( + (other.my_lru_cache_ptr != nullptr && other.my_map_record_ptr != nullptr) || + (other.my_lru_cache_ptr == nullptr && other.my_map_record_ptr == nullptr), + "invalid state of moving object?"); + + other.my_lru_cache_ptr = nullptr; + other.my_map_record_ptr = nullptr; + } + + handle_object& operator=(handle_object&& other) { + __TBB_ASSERT( + (other.my_lru_cache_ptr != nullptr && other.my_map_record_ptr != nullptr) || + (other.my_lru_cache_ptr == nullptr && other.my_map_record_ptr == nullptr), + "invalid state of moving object?"); + + if (my_lru_cache_ptr) + my_lru_cache_ptr->signal_end_of_usage(*my_map_record_ptr); + + my_lru_cache_ptr = other.my_lru_cache_ptr; + my_map_record_ptr = other.my_map_record_ptr; + other.my_lru_cache_ptr = nullptr; + other.my_map_record_ptr = nullptr; + + return *this; + } + + ~handle_object() { + if (my_lru_cache_ptr) + my_lru_cache_ptr->signal_end_of_usage(*my_map_record_ptr); + } + + operator bool() const { + return (my_lru_cache_ptr && my_map_record_ptr); + } + + value_type& value() { + __TBB_ASSERT(my_lru_cache_ptr, "get value from already moved object?"); + __TBB_ASSERT(my_map_record_ptr, "get value from an invalid or already moved object?"); + + return my_map_record_ptr->second.my_value; + } +}; + +//----------------------------------------------------------------------------- +// Aggregator operation for aggregator type in concurrent LRU cache +//----------------------------------------------------------------------------- + +template +struct concurrent_lru_cache::aggregator_operation + : aggregated_operation { +// incapsulated helper classes +public: + enum class op_type { retrieve, signal_end_of_usage }; + +// 
fields +private: + op_type my_op; + +// interface +public: + aggregator_operation(op_type op) : my_op(op) {} + + // TODO: aggregator_operation can be implemented + // - as a statically typed variant type or CRTP? (static, dependent on the use case) + // - or use pointer to function and apply_visitor (dynamic) + // - or use virtual functions (dynamic) + void cast_and_handle(lru_cache_type& lru_cache_ref) { + if (my_op == op_type::retrieve) + static_cast(this)->handle(lru_cache_ref); + else + static_cast(this)->handle(lru_cache_ref); + } +}; + +template +struct concurrent_lru_cache::retrieve_aggregator_operation + : aggregator_operation, private no_assign { +public: + key_type my_key; + storage_map_pointer_type my_map_record_ptr; + bool my_is_new_value_needed; + +public: + retrieve_aggregator_operation(key_type key) + : aggregator_operation(aggregator_operation::op_type::retrieve), + my_key(key), my_map_record_ptr(nullptr), my_is_new_value_needed(false) {} + + void handle(lru_cache_type& lru_cache_ref) { + my_map_record_ptr = &lru_cache_ref.retrieve_serial(my_key, my_is_new_value_needed); + } + + storage_map_reference_type result() { + __TBB_ASSERT(my_map_record_ptr, "Attempt to call result() before calling handle()"); + return *my_map_record_ptr; + } + + bool is_new_value_needed() { return my_is_new_value_needed; } +}; + +template +struct concurrent_lru_cache::signal_end_of_usage_aggregator_operation + : aggregator_operation, private no_assign { + +private: + storage_map_reference_type my_map_record_ref; + +public: + signal_end_of_usage_aggregator_operation(storage_map_reference_type map_record_ref) + : aggregator_operation(aggregator_operation::op_type::signal_end_of_usage), + my_map_record_ref(map_record_ref) {} + + void handle(lru_cache_type& lru_cache_ref) { + lru_cache_ref.signal_end_of_usage_serial(my_map_record_ref); + } +}; + +// TODO: if we have guarantees that KeyToValFunctorT always have +// ValT as a return type and KeyT as an argument type +// we can deduce template parameters of concurrent_lru_cache +// by pattern matching on KeyToValFunctorT + +} // namespace d1 +} // namespace detail + +inline namespace v1 { + +using detail::d1::concurrent_lru_cache; + +} // inline namespace v1 +} // namespace tbb + +#endif // __TBB_concurrent_lru_cache_H diff --git a/third_party/tbb/concurrent_map.h b/third_party/tbb/concurrent_map.h new file mode 100644 index 000000000..55e2f3568 --- /dev/null +++ b/third_party/tbb/concurrent_map.h @@ -0,0 +1,351 @@ +// clang-format off +/* + Copyright (c) 2019-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_concurrent_map_H +#define __TBB_concurrent_map_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_concurrent_skip_list.h" +#include "third_party/tbb/tbb_allocator.h" +#include "third_party/libcxx/functional" +#include "third_party/libcxx/tuple" +#include "third_party/libcxx/utility" + +namespace tbb { +namespace detail { +namespace d2 { + +template +struct map_traits { + static constexpr std::size_t max_level = RandomGenerator::max_level; + using random_level_generator_type = RandomGenerator; + using key_type = Key; + using mapped_type = Value; + using compare_type = KeyCompare; + using value_type = std::pair; + using reference = value_type&; + using const_reference = const value_type&; + using allocator_type = Allocator; + + static constexpr bool allow_multimapping = AllowMultimapping; + + class value_compare { + public: + bool operator()(const value_type& lhs, const value_type& rhs) const { + return comp(lhs.first, rhs.first); + } + + protected: + value_compare(compare_type c) : comp(c) {} + + friend struct map_traits; + + compare_type comp; + }; + + static value_compare value_comp(compare_type comp) { return value_compare(comp); } + + static const key_type& get_key(const_reference val) { + return val.first; + } +}; // struct map_traits + +template +class concurrent_multimap; + +template , typename Allocator = tbb::tbb_allocator>> +class concurrent_map : public concurrent_skip_list, Allocator, false>> { + using base_type = concurrent_skip_list, Allocator, false>>; +public: + using key_type = Key; + using mapped_type = Value; + using value_type = typename base_type::value_type; + using size_type = typename base_type::size_type; + using difference_type = typename base_type::difference_type; + using key_compare = Compare; + using value_compare = typename base_type::value_compare; + using allocator_type = Allocator; + + using reference = typename base_type::reference; + using const_reference = typename base_type::const_reference; + using pointer = typename base_type::pointer; + using const_pointer = typename base_type::const_pointer; + + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + using node_type = typename base_type::node_type; + + // Include constructors of base type + using base_type::base_type; + + // Required for implicit deduction guides + concurrent_map() = default; + concurrent_map( const concurrent_map& ) = default; + concurrent_map( const concurrent_map& other, const allocator_type& alloc ) : base_type(other, alloc) {} + concurrent_map( concurrent_map&& ) = default; + concurrent_map( concurrent_map&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} + // Required to respect the rule of 5 + concurrent_map& operator=( const concurrent_map& ) = default; + concurrent_map& operator=( concurrent_map&& ) = default; + + concurrent_map& operator=( std::initializer_list il ) { + base_type::operator= (il); + return *this; + } + + // Observers + mapped_type& at(const key_type& key) { + iterator it = this->find(key); + + if (it == this->end()) { + throw_exception(exception_id::invalid_key); + } + return it->second; + } + + const mapped_type& at(const key_type& key) const { + return const_cast(this)->at(key); + } + + mapped_type& operator[](const key_type& key) { + iterator it = this->find(key); + + if (it == this->end()) { + it = this->emplace(std::piecewise_construct, std::forward_as_tuple(key), std::tuple<>()).first; + } + return 
it->second; + } + + mapped_type& operator[](key_type&& key) { + iterator it = this->find(key); + + if (it == this->end()) { + it = this->emplace(std::piecewise_construct, std::forward_as_tuple(std::move(key)), std::tuple<>()).first; + } + return it->second; + } + + using base_type::insert; + + template + typename std::enable_if::value, + std::pair>::type insert( P&& value ) + { + return this->emplace(std::forward

<P>(value)); + } + + template<typename P> + typename std::enable_if<std::is_constructible<value_type, P&&>::value, + iterator>::type insert( const_iterator hint, P&& value ) + { + return this->emplace_hint(hint, std::forward<P>

(value)); + } + + template + void merge(concurrent_map& source) { + this->internal_merge(source); + } + + template + void merge(concurrent_map&& source) { + this->internal_merge(std::move(source)); + } + + template + void merge(concurrent_multimap& source) { + this->internal_merge(source); + } + + template + void merge(concurrent_multimap&& source) { + this->internal_merge(std::move(source)); + } +}; // class concurrent_map + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template >, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_map( It, It, Comp = Comp(), Alloc = Alloc() ) +-> concurrent_map, iterator_mapped_t, Comp, Alloc>; + +template >, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_map( std::initializer_list>, Comp = Comp(), Alloc = Alloc() ) +-> concurrent_map, T, Comp, Alloc>; + +template >, + typename = std::enable_if_t>> +concurrent_map( It, It, Alloc ) +-> concurrent_map, iterator_mapped_t, + std::less>, Alloc>; + +template >> +concurrent_map( std::initializer_list>, Alloc ) +-> concurrent_map, T, std::less>, Alloc>; + +#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template +void swap( concurrent_map& lhs, + concurrent_map& rhs ) +{ + lhs.swap(rhs); +} + +template , typename Allocator = tbb::tbb_allocator>> +class concurrent_multimap : public concurrent_skip_list, Allocator, true>> { + using base_type = concurrent_skip_list, Allocator, true>>; +public: + using key_type = Key; + using mapped_type = Value; + using value_type = typename base_type::value_type; + using size_type = typename base_type::size_type; + using difference_type = typename base_type::difference_type; + using key_compare = Compare; + using value_compare = typename base_type::value_compare; + using allocator_type = Allocator; + + using reference = typename base_type::reference; + using const_reference = typename base_type::const_reference; + using pointer = typename base_type::pointer; + using const_pointer = typename base_type::const_pointer; + + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + using node_type = typename base_type::node_type; + + // Include constructors of base_type + using base_type::base_type; + using base_type::insert; + + // Required for implicit deduction guides + concurrent_multimap() = default; + concurrent_multimap( const concurrent_multimap& ) = default; + concurrent_multimap( const concurrent_multimap& other, const allocator_type& alloc ) : base_type(other, alloc) {} + concurrent_multimap( concurrent_multimap&& ) = default; + concurrent_multimap( concurrent_multimap&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} + // Required to respect the rule of 5 + concurrent_multimap& operator=( const concurrent_multimap& ) = default; + concurrent_multimap& operator=( concurrent_multimap&& ) = default; + + concurrent_multimap& operator=( std::initializer_list il ) { + base_type::operator= (il); + return *this; + } + + template + typename std::enable_if::value, + std::pair>::type insert( P&& value ) + { + return this->emplace(std::forward

<P>(value)); + } + + template<typename P> + typename std::enable_if<std::is_constructible<value_type, P&&>::value, + iterator>::type insert( const_iterator hint, P&& value ) + { + return this->emplace_hint(hint, std::forward<P>

(value)); + } + + template + void merge(concurrent_multimap& source) { + this->internal_merge(source); + } + + template + void merge(concurrent_multimap&& source) { + this->internal_merge(std::move(source)); + } + + template + void merge(concurrent_map& source) { + this->internal_merge(source); + } + + template + void merge(concurrent_map&& source) { + this->internal_merge(std::move(source)); + } +}; // class concurrent_multimap + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template >, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_multimap( It, It, Comp = Comp(), Alloc = Alloc() ) +-> concurrent_multimap, iterator_mapped_t, Comp, Alloc>; + +template >, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_multimap( std::initializer_list>, Comp = Comp(), Alloc = Alloc() ) +-> concurrent_multimap, T, Comp, Alloc>; + +template >, + typename = std::enable_if_t>> +concurrent_multimap( It, It, Alloc ) +-> concurrent_multimap, iterator_mapped_t, + std::less>, Alloc>; + +template >> +concurrent_multimap( std::initializer_list>, Alloc ) +-> concurrent_multimap, T, std::less>, Alloc>; + + +#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template +void swap( concurrent_multimap& lhs, + concurrent_multimap& rhs ) +{ + lhs.swap(rhs); +} + +} // namespace d2 +} // namespace detail + +inline namespace v1 { + +using detail::d2::concurrent_map; +using detail::d2::concurrent_multimap; +using detail::split; + +} // inline namespace v1 +} // namespace tbb + +#endif // __TBB_concurrent_map_H diff --git a/third_party/tbb/concurrent_monitor.h b/third_party/tbb/concurrent_monitor.h new file mode 100644 index 000000000..539706ed1 --- /dev/null +++ b/third_party/tbb/concurrent_monitor.h @@ -0,0 +1,489 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_concurrent_monitor_H +#define __TBB_concurrent_monitor_H + +#include "third_party/tbb/spin_mutex.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_aligned_space.h" +#include "third_party/tbb/concurrent_monitor_mutex.h" +#include "third_party/tbb/semaphore.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace r1 { + +//! 
Circular doubly-linked list with sentinel +/** head.next points to the front and head.prev points to the back */ +class circular_doubly_linked_list_with_sentinel { +public: + struct base_node { + base_node* next; + base_node* prev; + + constexpr base_node(base_node* n, base_node* p) : next(n), prev(p) {} + explicit base_node() : next((base_node*)(uintptr_t)0xcdcdcdcd), prev((base_node*)(uintptr_t)0xcdcdcdcd) {} + }; + + // ctor + constexpr circular_doubly_linked_list_with_sentinel() : count(0), head(&head, &head) {} + + circular_doubly_linked_list_with_sentinel(const circular_doubly_linked_list_with_sentinel&) = delete; + circular_doubly_linked_list_with_sentinel& operator=(const circular_doubly_linked_list_with_sentinel&) = delete; + + inline std::size_t size() const { return count.load(std::memory_order_relaxed); } + inline bool empty() const { return size() == 0; } + inline base_node* front() const { return head.next; } + inline base_node* last() const { return head.prev; } + inline const base_node* end() const { return &head; } + + //! add to the back of the list + inline void add( base_node* n ) { + count.store(count.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed); + n->prev = head.prev; + n->next = &head; + head.prev->next = n; + head.prev = n; + } + + //! remove node 'n' + inline void remove( base_node& n ) { + __TBB_ASSERT(count.load(std::memory_order_relaxed) > 0, "attempt to remove an item from an empty list"); + count.store(count.load( std::memory_order_relaxed ) - 1, std::memory_order_relaxed); + n.prev->next = n.next; + n.next->prev = n.prev; + } + + //! move all elements to 'lst' and initialize the 'this' list + inline void flush_to( circular_doubly_linked_list_with_sentinel& lst ) { + const std::size_t l_count = size(); + if (l_count > 0) { + lst.count.store(l_count, std::memory_order_relaxed); + lst.head.next = head.next; + lst.head.prev = head.prev; + head.next->prev = &lst.head; + head.prev->next = &lst.head; + clear(); + } + } + + void clear() { + head.next = &head; + head.prev = &head; + count.store(0, std::memory_order_relaxed); + } +private: + std::atomic count; + base_node head; +}; + +using base_list = circular_doubly_linked_list_with_sentinel; +using base_node = circular_doubly_linked_list_with_sentinel::base_node; + +template +class concurrent_monitor_base; + +template +class wait_node : public base_node { +public: + +#if __TBB_GLIBCXX_VERSION >= 40800 && __TBB_GLIBCXX_VERSION < 40900 + wait_node(Context ctx) : my_context(ctx), my_is_in_list(false) {} +#else + wait_node(Context ctx) : my_context(ctx) {} +#endif + + virtual ~wait_node() = default; + + virtual void init() { + __TBB_ASSERT(!my_initialized, nullptr); + my_initialized = true; + } + + virtual void wait() = 0; + + virtual void reset() { + __TBB_ASSERT(my_skipped_wakeup, nullptr); + my_skipped_wakeup = false; + } + + virtual void notify() = 0; + +protected: + friend class concurrent_monitor_base; + friend class thread_data; + + Context my_context{}; +#if __TBB_GLIBCXX_VERSION >= 40800 && __TBB_GLIBCXX_VERSION < 40900 + std::atomic my_is_in_list; +#else + std::atomic my_is_in_list{false}; +#endif + + bool my_initialized{false}; + bool my_skipped_wakeup{false}; + bool my_aborted{false}; + unsigned my_epoch{0}; +}; + +template +class sleep_node : public wait_node { + using base_type = wait_node; +public: + using base_type::base_type; + + ~sleep_node() override { + if (this->my_initialized) { + if (this->my_skipped_wakeup) semaphore().P(); + semaphore().~binary_semaphore(); + } + } + + 
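+    // The binary_semaphore lives in the raw `sema` storage below and is constructed
+    // lazily by init(), so this accessor is only meaningful once my_initialized is set.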
binary_semaphore& semaphore() { return *sema.begin(); } + + void init() override { + if (!this->my_initialized) { + new (sema.begin()) binary_semaphore; + base_type::init(); + } + } + + void wait() override { + __TBB_ASSERT(this->my_initialized, + "Use of commit_wait() without prior prepare_wait()"); + semaphore().P(); + __TBB_ASSERT(!this->my_is_in_list.load(std::memory_order_relaxed), "Still in the queue?"); + if (this->my_aborted) + throw_exception(exception_id::user_abort); + } + + void reset() override { + base_type::reset(); + semaphore().P(); + } + + void notify() override { + semaphore().V(); + } + +private: + tbb::detail::aligned_space sema; +}; + +//! concurrent_monitor +/** fine-grained concurrent_monitor implementation */ +template +class concurrent_monitor_base { +public: + //! ctor + constexpr concurrent_monitor_base() {} + //! dtor + ~concurrent_monitor_base() = default; + + concurrent_monitor_base(const concurrent_monitor_base&) = delete; + concurrent_monitor_base& operator=(const concurrent_monitor_base&) = delete; + + //! prepare wait by inserting 'thr' into the wait queue + void prepare_wait( wait_node& node) { + // TODO: consider making even more lazy instantiation of the semaphore, that is only when it is actually needed, e.g. move it in node::wait() + if (!node.my_initialized) { + node.init(); + } + // this is good place to pump previous skipped wakeup + else if (node.my_skipped_wakeup) { + node.reset(); + } + + node.my_is_in_list.store(true, std::memory_order_relaxed); + + { + concurrent_monitor_mutex::scoped_lock l(my_mutex); + node.my_epoch = my_epoch.load(std::memory_order_relaxed); + my_waitset.add(&node); + } + + // Prepare wait guarantees Write Read memory barrier. + // In C++ only full fence covers this type of barrier. + atomic_fence_seq_cst(); + } + + //! Commit wait if event count has not changed; otherwise, cancel wait. + /** Returns true if committed, false if canceled. */ + inline bool commit_wait( wait_node& node ) { + const bool do_it = node.my_epoch == my_epoch.load(std::memory_order_relaxed); + // this check is just an optimization + if (do_it) { + node.wait(); + } else { + cancel_wait( node ); + } + return do_it; + } + + //! Cancel the wait. Removes the thread from the wait queue if not removed yet. + void cancel_wait( wait_node& node ) { + // possible skipped wakeup will be pumped in the following prepare_wait() + node.my_skipped_wakeup = true; + // try to remove node from waitset + // Cancel wait guarantees acquire memory barrier. + bool in_list = node.my_is_in_list.load(std::memory_order_acquire); + if (in_list) { + concurrent_monitor_mutex::scoped_lock l(my_mutex); + if (node.my_is_in_list.load(std::memory_order_relaxed)) { + my_waitset.remove(node); + // node is removed from waitset, so there will be no wakeup + node.my_is_in_list.store(false, std::memory_order_relaxed); + node.my_skipped_wakeup = false; + } + } + } + + //! Wait for a condition to be satisfied with waiting-on my_context + template + bool wait(Pred&& pred, NodeType&& node) { + prepare_wait(node); + while (!guarded_call(std::forward(pred), node)) { + if (commit_wait(node)) { + return true; + } + + prepare_wait(node); + } + + cancel_wait(node); + return false; + } + + //! Notify one thread about the event + void notify_one() { + atomic_fence_seq_cst(); + notify_one_relaxed(); + } + + //! Notify one thread about the event. Relaxed version. 
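+    //! The relaxed form is intended for callers that have already established the
+    //! required ordering; notify_one() above issues atomic_fence_seq_cst() first.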
+ void notify_one_relaxed() { + if (my_waitset.empty()) { + return; + } + + base_node* n; + const base_node* end = my_waitset.end(); + { + concurrent_monitor_mutex::scoped_lock l(my_mutex); + my_epoch.store(my_epoch.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed); + n = my_waitset.front(); + if (n != end) { + my_waitset.remove(*n); + to_wait_node(n)->my_is_in_list.store(false, std::memory_order_relaxed); + } + } + + if (n != end) { + to_wait_node(n)->notify(); + } + } + + //! Notify all waiting threads of the event + void notify_all() { + atomic_fence_seq_cst(); + notify_all_relaxed(); + } + + // ! Notify all waiting threads of the event; Relaxed version + void notify_all_relaxed() { + if (my_waitset.empty()) { + return; + } + + base_list temp; + const base_node* end; + { + concurrent_monitor_mutex::scoped_lock l(my_mutex); + my_epoch.store(my_epoch.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed); + // TODO: Possible optimization, don't change node state under lock, just do flush + my_waitset.flush_to(temp); + end = temp.end(); + for (base_node* n = temp.front(); n != end; n = n->next) { + to_wait_node(n)->my_is_in_list.store(false, std::memory_order_relaxed); + } + } + + base_node* nxt; + for (base_node* n = temp.front(); n != end; n=nxt) { + nxt = n->next; + to_wait_node(n)->notify(); + } +#if TBB_USE_ASSERT + temp.clear(); +#endif + } + + //! Notify waiting threads of the event that satisfies the given predicate + template + void notify( const P& predicate ) { + atomic_fence_seq_cst(); + notify_relaxed( predicate ); + } + + //! Notify waiting threads of the event that satisfies the given predicate; + //! the predicate is called under the lock. Relaxed version. + template + void notify_relaxed( const P& predicate ) { + if (my_waitset.empty()) { + return; + } + + base_list temp; + base_node* nxt; + const base_node* end = my_waitset.end(); + { + concurrent_monitor_mutex::scoped_lock l(my_mutex); + my_epoch.store(my_epoch.load( std::memory_order_relaxed ) + 1, std::memory_order_relaxed); + for (base_node* n = my_waitset.last(); n != end; n = nxt) { + nxt = n->prev; + auto* node = static_cast*>(n); + if (predicate(node->my_context)) { + my_waitset.remove(*n); + node->my_is_in_list.store(false, std::memory_order_relaxed); + temp.add(n); + } + } + } + + end = temp.end(); + for (base_node* n=temp.front(); n != end; n = nxt) { + nxt = n->next; + to_wait_node(n)->notify(); + } +#if TBB_USE_ASSERT + temp.clear(); +#endif + } + + //! Notify waiting threads of the event that satisfies the given predicate; + //! the predicate is called under the lock. Relaxed version. + template + void notify_one_relaxed( const P& predicate ) { + if (my_waitset.empty()) { + return; + } + + base_node* tmp = nullptr; + base_node* next{}; + const base_node* end = my_waitset.end(); + { + concurrent_monitor_mutex::scoped_lock l(my_mutex); + my_epoch.store(my_epoch.load( std::memory_order_relaxed ) + 1, std::memory_order_relaxed); + for (base_node* n = my_waitset.last(); n != end; n = next) { + next = n->prev; + auto* node = static_cast*>(n); + if (predicate(node->my_context)) { + my_waitset.remove(*n); + node->my_is_in_list.store(false, std::memory_order_relaxed); + tmp = n; + break; + } + } + } + + if (tmp) { + to_wait_node(tmp)->notify(); + } + } + + //! Abort any sleeping threads at the time of the call + void abort_all() { + atomic_fence_seq_cst(); + abort_all_relaxed(); + } + + //! 
Abort any sleeping threads at the time of the call; Relaxed version + void abort_all_relaxed() { + if (my_waitset.empty()) { + return; + } + + base_list temp; + const base_node* end; + { + concurrent_monitor_mutex::scoped_lock l(my_mutex); + my_epoch.store(my_epoch.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed); + my_waitset.flush_to(temp); + end = temp.end(); + for (base_node* n = temp.front(); n != end; n = n->next) { + to_wait_node(n)->my_is_in_list.store(false, std::memory_order_relaxed); + } + } + + base_node* nxt; + for (base_node* n = temp.front(); n != end; n = nxt) { + nxt = n->next; + to_wait_node(n)->my_aborted = true; + to_wait_node(n)->notify(); + } +#if TBB_USE_ASSERT + temp.clear(); +#endif + } + + void destroy() { + this->abort_all(); + my_mutex.destroy(); + __TBB_ASSERT(this->my_waitset.empty(), "waitset not empty?"); + } + +private: + template + bool guarded_call(Pred&& predicate, NodeType& node) { + bool res = false; + tbb::detail::d0::try_call( [&] { + res = std::forward(predicate)(); + }).on_exception( [&] { + cancel_wait(node); + }); + + return res; + } + + concurrent_monitor_mutex my_mutex{}; + base_list my_waitset{}; + std::atomic my_epoch{}; + + wait_node* to_wait_node( base_node* node ) { return static_cast*>(node); } +}; + +class concurrent_monitor : public concurrent_monitor_base { + using base_type = concurrent_monitor_base; +public: + using base_type::base_type; + + ~concurrent_monitor() { + destroy(); + } + + /** per-thread descriptor for concurrent_monitor */ + using thread_context = sleep_node; +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* __TBB_concurrent_monitor_H */ diff --git a/third_party/tbb/concurrent_monitor_mutex.h b/third_party/tbb/concurrent_monitor_mutex.h new file mode 100644 index 000000000..14d6317e7 --- /dev/null +++ b/third_party/tbb/concurrent_monitor_mutex.h @@ -0,0 +1,114 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_monitor_mutex_H +#define __TBB_monitor_mutex_H + +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_aligned_space.h" +#include "third_party/tbb/semaphore.h" + +#include "third_party/libcxx/mutex" + +namespace tbb { +namespace detail { +namespace r1 { + +class concurrent_monitor_mutex { +public: + using scoped_lock = std::lock_guard; + + constexpr concurrent_monitor_mutex() {} + + ~concurrent_monitor_mutex() = default; + + void destroy() { +#if !__TBB_USE_FUTEX + if (my_init_flag.load(std::memory_order_relaxed)) { + get_semaphore().~semaphore(); + } +#endif + } + + void lock() { + auto wakeup_condition = [&] { + return my_flag.load(std::memory_order_relaxed) == 0; + }; + + while (my_flag.exchange(1)) { + if (!timed_spin_wait_until(wakeup_condition)) { + ++my_waiters; + while (!wakeup_condition()) { + wait(); + } + --my_waiters; + } + } + } + + void unlock() { + my_flag.exchange(0); // full fence, so the next load is relaxed + if (my_waiters.load(std::memory_order_relaxed)) { + wakeup(); + } + } + +private: + void wait() { +#if __TBB_USE_FUTEX + futex_wait(&my_flag, 1); +#else + get_semaphore().P(); +#endif + } + + void wakeup() { +#if __TBB_USE_FUTEX + futex_wakeup_one(&my_flag); +#else + get_semaphore().V(); +#endif + } + + // The flag should be int for the futex operations + std::atomic my_flag{0}; + std::atomic my_waiters{0}; + +#if !__TBB_USE_FUTEX + semaphore& get_semaphore() { + if (!my_init_flag.load(std::memory_order_acquire)) { + std::lock_guard lock(my_init_mutex); + if (!my_init_flag.load(std::memory_order_relaxed)) { + new (my_semaphore.begin()) semaphore(); + my_init_flag.store(true, std::memory_order_release); + } + } + + return *my_semaphore.begin(); + } + + static std::mutex my_init_mutex; + std::atomic my_init_flag{false}; + aligned_space my_semaphore{}; +#endif +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_monitor_mutex_H diff --git a/third_party/tbb/concurrent_priority_queue.h b/third_party/tbb/concurrent_priority_queue.h new file mode 100644 index 000000000..86e915dee --- /dev/null +++ b/third_party/tbb/concurrent_priority_queue.h @@ -0,0 +1,491 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_concurrent_priority_queue_H +#define __TBB_concurrent_priority_queue_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_aggregator.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_allocator_traits.h" +#include "third_party/tbb/detail/_range_common.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_containers_helpers.h" +#include "third_party/tbb/cache_aligned_allocator.h" +#include "third_party/libcxx/vector" +#include "third_party/libcxx/iterator" +#include "third_party/libcxx/functional" +#include "third_party/libcxx/utility" +#include "third_party/libcxx/initializer_list" +#include "third_party/libcxx/type_traits" + +namespace tbb { +namespace detail { +namespace d1 { + +template , typename Allocator = cache_aligned_allocator> +class concurrent_priority_queue { +public: + using value_type = T; + using reference = T&; + using const_reference = const T&; + + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + + using allocator_type = Allocator; + + concurrent_priority_queue() : concurrent_priority_queue(allocator_type{}) {} + + explicit concurrent_priority_queue( const allocator_type& alloc ) + : mark(0), my_size(0), my_compare(), data(alloc) + { + my_aggregator.initialize_handler(functor{this}); + } + + explicit concurrent_priority_queue( const Compare& compare, const allocator_type& alloc = allocator_type() ) + : mark(0), my_size(0), my_compare(compare), data(alloc) + { + my_aggregator.initialize_handler(functor{this}); + } + + explicit concurrent_priority_queue( size_type init_capacity, const allocator_type& alloc = allocator_type() ) + : mark(0), my_size(0), my_compare(), data(alloc) + { + data.reserve(init_capacity); + my_aggregator.initialize_handler(functor{this}); + } + + explicit concurrent_priority_queue( size_type init_capacity, const Compare& compare, const allocator_type& alloc = allocator_type() ) + : mark(0), my_size(0), my_compare(compare), data(alloc) + { + data.reserve(init_capacity); + my_aggregator.initialize_handler(functor{this}); + } + + template + concurrent_priority_queue( InputIterator begin, InputIterator end, const Compare& compare, const allocator_type& alloc = allocator_type() ) + : mark(0), my_compare(compare), data(begin, end, alloc) + { + my_aggregator.initialize_handler(functor{this}); + heapify(); + my_size.store(data.size(), std::memory_order_relaxed); + } + + template + concurrent_priority_queue( InputIterator begin, InputIterator end, const allocator_type& alloc = allocator_type() ) + : concurrent_priority_queue(begin, end, Compare(), alloc) {} + + concurrent_priority_queue( std::initializer_list init, const Compare& compare, const allocator_type& alloc = allocator_type() ) + : concurrent_priority_queue(init.begin(), init.end(), compare, alloc) {} + + concurrent_priority_queue( std::initializer_list init, const allocator_type& alloc = allocator_type() ) + : concurrent_priority_queue(init, Compare(), alloc) {} + + concurrent_priority_queue( const concurrent_priority_queue& other ) + : mark(other.mark), my_size(other.my_size.load(std::memory_order_relaxed)), my_compare(other.my_compare), + data(other.data) + { + my_aggregator.initialize_handler(functor{this}); + } + + concurrent_priority_queue( const concurrent_priority_queue& other, const allocator_type& alloc ) + : mark(other.mark), my_size(other.my_size.load(std::memory_order_relaxed)), 
my_compare(other.my_compare), + data(other.data, alloc) + { + my_aggregator.initialize_handler(functor{this}); + } + + concurrent_priority_queue( concurrent_priority_queue&& other ) + : mark(other.mark), my_size(other.my_size.load(std::memory_order_relaxed)), my_compare(other.my_compare), + data(std::move(other.data)) + { + my_aggregator.initialize_handler(functor{this}); + } + + concurrent_priority_queue( concurrent_priority_queue&& other, const allocator_type& alloc ) + : mark(other.mark), my_size(other.my_size.load(std::memory_order_relaxed)), my_compare(other.my_compare), + data(std::move(other.data), alloc) + { + my_aggregator.initialize_handler(functor{this}); + } + + concurrent_priority_queue& operator=( const concurrent_priority_queue& other ) { + if (this != &other) { + data = other.data; + mark = other.mark; + my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); + } + return *this; + } + + concurrent_priority_queue& operator=( concurrent_priority_queue&& other ) { + if (this != &other) { + // TODO: check if exceptions from std::vector::operator=(vector&&) should be handled separately + data = std::move(other.data); + mark = other.mark; + my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); + } + return *this; + } + + concurrent_priority_queue& operator=( std::initializer_list init ) { + assign(init.begin(), init.end()); + return *this; + } + + template + void assign( InputIterator begin, InputIterator end ) { + data.assign(begin, end); + mark = 0; + my_size.store(data.size(), std::memory_order_relaxed); + heapify(); + } + + void assign( std::initializer_list init ) { + assign(init.begin(), init.end()); + } + + /* Returned value may not reflect results of pending operations. + This operation reads shared data and will trigger a race condition. */ + __TBB_nodiscard bool empty() const { return size() == 0; } + + // Returns the current number of elements contained in the queue + /* Returned value may not reflect results of pending operations. + This operation reads shared data and will trigger a race condition. */ + size_type size() const { return my_size.load(std::memory_order_relaxed); } + + /* This operation can be safely used concurrently with other push, try_pop or emplace operations. */ + void push( const value_type& value ) { + cpq_operation op_data(value, PUSH_OP); + my_aggregator.execute(&op_data); + if (op_data.status == FAILED) + throw_exception(exception_id::bad_alloc); + } + + /* This operation can be safely used concurrently with other push, try_pop or emplace operations. */ + void push( value_type&& value ) { + cpq_operation op_data(value, PUSH_RVALUE_OP); + my_aggregator.execute(&op_data); + if (op_data.status == FAILED) + throw_exception(exception_id::bad_alloc); + } + + /* This operation can be safely used concurrently with other push, try_pop or emplace operations. */ + template + void emplace( Args&&... args ) { + // TODO: support uses allocator construction in this place + push(value_type(std::forward(args)...)); + } + + // Gets a reference to and removes highest priority element + /* If a highest priority element was found, sets elem and returns true, + otherwise returns false. + This operation can be safely used concurrently with other push, try_pop or emplace operations. 
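+
+       A minimal usage sketch (illustrative only, not part of the original header;
+       assumes the default std::less comparator):
+
+           tbb::concurrent_priority_queue<int> q;
+           q.push(3); q.push(5); q.push(1);
+           int top;
+           if (q.try_pop(top)) {
+               // top == 5: the largest element has the highest priority
+           }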
*/ + bool try_pop( value_type& value ) { + cpq_operation op_data(value, POP_OP); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + // This operation affects the whole container => it is not thread-safe + void clear() { + data.clear(); + mark = 0; + my_size.store(0, std::memory_order_relaxed); + } + + // This operation affects the whole container => it is not thread-safe + void swap( concurrent_priority_queue& other ) { + if (this != &other) { + using std::swap; + swap(data, other.data); + swap(mark, other.mark); + + size_type sz = my_size.load(std::memory_order_relaxed); + my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.my_size.store(sz, std::memory_order_relaxed); + } + } + + allocator_type get_allocator() const { return data.get_allocator(); } +private: + enum operation_type {INVALID_OP, PUSH_OP, POP_OP, PUSH_RVALUE_OP}; + enum operation_status {WAIT = 0, SUCCEEDED, FAILED}; + + class cpq_operation : public aggregated_operation { + public: + operation_type type; + union { + value_type* elem; + size_type sz; + }; + cpq_operation( const value_type& value, operation_type t ) + : type(t), elem(const_cast(&value)) {} + }; // class cpq_operation + + class functor { + concurrent_priority_queue* my_cpq; + public: + functor() : my_cpq(nullptr) {} + functor( concurrent_priority_queue* cpq ) : my_cpq(cpq) {} + + void operator()(cpq_operation* op_list) { + __TBB_ASSERT(my_cpq != nullptr, "Invalid functor"); + my_cpq->handle_operations(op_list); + } + }; // class functor + + void handle_operations( cpq_operation* op_list ) { + call_itt_notify(acquired, this); + cpq_operation* tmp, *pop_list = nullptr; + __TBB_ASSERT(mark == data.size(), nullptr); + + // First pass processes all constant (amortized; reallocation may happen) time pushes and pops. + while(op_list) { + // ITT note: &(op_list->status) tag is used to cover accesses to op_list + // node. This thread is going to handle the operation, and so will acquire it + // and perform the associated operation w/o triggering a race condition; the + // thread that created the operation is waiting on the status field, so when + // this thread is done with the operation, it will perform a + // store_with_release to give control back to the waiting thread in + // aggregator::insert_operation. 
+ // TODO: enable + call_itt_notify(acquired, &(op_list->status)); + __TBB_ASSERT(op_list->type != INVALID_OP, nullptr); + + tmp = op_list; + op_list = op_list->next.load(std::memory_order_relaxed); + if (tmp->type == POP_OP) { + if (mark < data.size() && + my_compare(data[0], data.back())) + { + // there are newly pushed elems and the last one is higher than top + *(tmp->elem) = std::move(data.back()); + my_size.store(my_size.load(std::memory_order_relaxed) - 1, std::memory_order_relaxed); + tmp->status.store(uintptr_t(SUCCEEDED), std::memory_order_release); + + data.pop_back(); + __TBB_ASSERT(mark <= data.size(), nullptr); + } else { // no convenient item to pop; postpone + tmp->next.store(pop_list, std::memory_order_relaxed); + pop_list = tmp; + } + } else { // PUSH_OP or PUSH_RVALUE_OP + __TBB_ASSERT(tmp->type == PUSH_OP || tmp->type == PUSH_RVALUE_OP, "Unknown operation"); +#if TBB_USE_EXCEPTIONS + try +#endif + { + if (tmp->type == PUSH_OP) { + push_back_helper(*(tmp->elem)); + } else { + data.push_back(std::move(*(tmp->elem))); + } + my_size.store(my_size.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed); + tmp->status.store(uintptr_t(SUCCEEDED), std::memory_order_release); + } +#if TBB_USE_EXCEPTIONS + catch(...) { + tmp->status.store(uintptr_t(FAILED), std::memory_order_release); + } +#endif + } + } + + // Second pass processes pop operations + while(pop_list) { + tmp = pop_list; + pop_list = pop_list->next.load(std::memory_order_relaxed); + __TBB_ASSERT(tmp->type == POP_OP, nullptr); + if (data.empty()) { + tmp->status.store(uintptr_t(FAILED), std::memory_order_release); + } else { + __TBB_ASSERT(mark <= data.size(), nullptr); + if (mark < data.size() && + my_compare(data[0], data.back())) + { + // there are newly pushed elems and the last one is higher than top + *(tmp->elem) = std::move(data.back()); + my_size.store(my_size.load(std::memory_order_relaxed) - 1, std::memory_order_relaxed); + tmp->status.store(uintptr_t(SUCCEEDED), std::memory_order_release); + data.pop_back(); + } else { // extract top and push last element down heap + *(tmp->elem) = std::move(data[0]); + my_size.store(my_size.load(std::memory_order_relaxed) - 1, std::memory_order_relaxed); + tmp->status.store(uintptr_t(SUCCEEDED), std::memory_order_release); + reheap(); + } + } + } + + // heapify any leftover pushed elements before doing the next + // batch of operations + if (mark < data.size()) heapify(); + __TBB_ASSERT(mark == data.size(), nullptr); + call_itt_notify(releasing, this); + } + + // Merge unsorted elements into heap + void heapify() { + if (!mark && data.size() > 0) mark = 1; + for (; mark < data.size(); ++mark) { + // for each unheapified element under size + size_type cur_pos = mark; + value_type to_place = std::move(data[mark]); + do { // push to_place up the heap + size_type parent = (cur_pos - 1) >> 1; + if (!my_compare(data[parent], to_place)) + break; + data[cur_pos] = std::move(data[parent]); + cur_pos = parent; + } while(cur_pos); + data[cur_pos] = std::move(to_place); + } + } + + // Re-heapify after an extraction + // Re-heapify by pushing last element down the heap from the root. 
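+    // The sift-down below stays within the heapified prefix [0, mark); the hole left
+    // at the root is filled with the vector's last element before it is popped.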
+ void reheap() { + size_type cur_pos = 0, child = 1; + + while(child < mark) { + size_type target = child; + if (child + 1 < mark && my_compare(data[child], data[child + 1])) + ++target; + // target now has the higher priority child + if (my_compare(data[target], data.back())) + break; + data[cur_pos] = std::move(data[target]); + cur_pos = target; + child = (cur_pos << 1) + 1; + } + if (cur_pos != data.size() - 1) + data[cur_pos] = std::move(data.back()); + data.pop_back(); + if (mark > data.size()) mark = data.size(); + } + + void push_back_helper( const T& value ) { + push_back_helper_impl(value, std::is_copy_constructible{}); + } + + void push_back_helper_impl( const T& value, /*is_copy_constructible = */std::true_type ) { + data.push_back(value); + } + + void push_back_helper_impl( const T&, /*is_copy_constructible = */std::false_type ) { + __TBB_ASSERT(false, "error: calling tbb::concurrent_priority_queue.push(const value_type&) for move-only type"); + } + + using aggregator_type = aggregator; + + aggregator_type my_aggregator; + // Padding added to avoid false sharing + char padding1[max_nfs_size - sizeof(aggregator_type)]; + // The point at which unsorted elements begin + size_type mark; + std::atomic my_size; + Compare my_compare; + + // Padding added to avoid false sharing + char padding2[max_nfs_size - (2*sizeof(size_type)) - sizeof(Compare)]; + //! Storage for the heap of elements in queue, plus unheapified elements + /** data has the following structure: + + binary unheapified + heap elements + ____|_______|____ + | | | + v v v + [_|...|_|_|...|_| |...| ] + 0 ^ ^ ^ + | | |__capacity + | |__my_size + |__mark + + Thus, data stores the binary heap starting at position 0 through + mark-1 (it may be empty). Then there are 0 or more elements + that have not yet been inserted into the heap, in positions + mark through my_size-1. 
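+        my_size mirrors data.size() in a separate atomic so that size() and empty()
+        can be read without entering the aggregator.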
*/ + + using vector_type = std::vector; + vector_type data; + + friend bool operator==( const concurrent_priority_queue& lhs, + const concurrent_priority_queue& rhs ) + { + return lhs.data == rhs.data; + } + +#if !__TBB_CPP20_COMPARISONS_PRESENT + friend bool operator!=( const concurrent_priority_queue& lhs, + const concurrent_priority_queue& rhs ) + { + return !(lhs == rhs); + } +#endif +}; // class concurrent_priority_queue + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT +template >, + typename Alloc = tbb::cache_aligned_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_priority_queue( It, It, Comp = Comp(), Alloc = Alloc() ) +-> concurrent_priority_queue, Comp, Alloc>; + +template >, + typename = std::enable_if_t>> +concurrent_priority_queue( It, It, Alloc ) +-> concurrent_priority_queue, std::less>, Alloc>; + +template , + typename Alloc = tbb::cache_aligned_allocator, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_priority_queue( std::initializer_list, Comp = Comp(), Alloc = Alloc() ) +-> concurrent_priority_queue; + +template >> +concurrent_priority_queue( std::initializer_list, Alloc ) +-> concurrent_priority_queue, Alloc>; + +#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template +void swap( concurrent_priority_queue& lhs, + concurrent_priority_queue& rhs ) +{ + lhs.swap(rhs); +} + +} // namespace d1 +} // namespace detail +inline namespace v1 { +using detail::d1::concurrent_priority_queue; + +} // inline namespace v1 +} // namespace tbb + +#endif // __TBB_concurrent_priority_queue_H diff --git a/third_party/tbb/concurrent_queue.h b/third_party/tbb/concurrent_queue.h new file mode 100644 index 000000000..2cceab80f --- /dev/null +++ b/third_party/tbb/concurrent_queue.h @@ -0,0 +1,701 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_concurrent_queue_H +#define __TBB_concurrent_queue_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_concurrent_queue_base.h" +#include "third_party/tbb/detail/_allocator_traits.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_containers_helpers.h" +#include "third_party/tbb/cache_aligned_allocator.h" + +namespace tbb { +namespace detail { +namespace d2 { + +template +std::pair internal_try_pop_impl(void* dst, QueueRep& queue, Allocator& alloc ) { + ticket_type ticket{}; + do { + // Basically, we need to read `head_counter` before `tail_counter`. To achieve it we build happens-before on `head_counter` + ticket = queue.head_counter.load(std::memory_order_acquire); + do { + if (static_cast(queue.tail_counter.load(std::memory_order_relaxed) - ticket) <= 0) { // queue is empty + // Queue is empty + return { false, ticket }; + } + // Queue had item with ticket k when we looked. Attempt to get that item. + // Another thread snatched the item, retry. 
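+            // On CAS failure `ticket` is reloaded with the current head_counter and
+            // the loop re-checks whether the queue became empty.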
+ } while (!queue.head_counter.compare_exchange_strong(ticket, ticket + 1)); + } while (!queue.choose(ticket).pop(dst, ticket, queue, alloc)); + return { true, ticket }; +} + +// A high-performance thread-safe non-blocking concurrent queue. +// Multiple threads may each push and pop concurrently. +// Assignment construction is not allowed. +template > +class concurrent_queue { + using allocator_traits_type = tbb::detail::allocator_traits; + using queue_representation_type = concurrent_queue_rep; + using queue_allocator_type = typename allocator_traits_type::template rebind_alloc; + using queue_allocator_traits = tbb::detail::allocator_traits; +public: + using size_type = std::size_t; + using value_type = T; + using reference = T&; + using const_reference = const T&; + using difference_type = std::ptrdiff_t; + + using allocator_type = Allocator; + using pointer = typename allocator_traits_type::pointer; + using const_pointer = typename allocator_traits_type::const_pointer; + + using iterator = concurrent_queue_iterator; + using const_iterator = concurrent_queue_iterator; + + concurrent_queue() : concurrent_queue(allocator_type()) {} + + explicit concurrent_queue(const allocator_type& a) : + my_allocator(a), my_queue_representation(nullptr) + { + my_queue_representation = static_cast(r1::cache_aligned_allocate(sizeof(queue_representation_type))); + queue_allocator_traits::construct(my_allocator, my_queue_representation); + + __TBB_ASSERT(is_aligned(my_queue_representation, max_nfs_size), "alignment error" ); + __TBB_ASSERT(is_aligned(&my_queue_representation->head_counter, max_nfs_size), "alignment error" ); + __TBB_ASSERT(is_aligned(&my_queue_representation->tail_counter, max_nfs_size), "alignment error" ); + __TBB_ASSERT(is_aligned(&my_queue_representation->array, max_nfs_size), "alignment error" ); + } + + template + concurrent_queue(InputIterator begin, InputIterator end, const allocator_type& a = allocator_type()) : + concurrent_queue(a) + { + for (; begin != end; ++begin) + push(*begin); + } + + concurrent_queue( std::initializer_list init, const allocator_type& alloc = allocator_type() ) : + concurrent_queue(init.begin(), init.end(), alloc) + {} + + concurrent_queue(const concurrent_queue& src, const allocator_type& a) : + concurrent_queue(a) + { + my_queue_representation->assign(*src.my_queue_representation, my_allocator, copy_construct_item); + } + + concurrent_queue(const concurrent_queue& src) : + concurrent_queue(queue_allocator_traits::select_on_container_copy_construction(src.get_allocator())) + { + my_queue_representation->assign(*src.my_queue_representation, my_allocator, copy_construct_item); + } + + // Move constructors + concurrent_queue(concurrent_queue&& src) : + concurrent_queue(std::move(src.my_allocator)) + { + internal_swap(src); + } + + concurrent_queue(concurrent_queue&& src, const allocator_type& a) : + concurrent_queue(a) + { + // checking that memory allocated by one instance of allocator can be deallocated + // with another + if (my_allocator == src.my_allocator) { + internal_swap(src); + } else { + // allocators are different => performing per-element move + my_queue_representation->assign(*src.my_queue_representation, my_allocator, move_construct_item); + src.clear(); + } + } + + // Destroy queue + ~concurrent_queue() { + clear(); + my_queue_representation->clear(my_allocator); + queue_allocator_traits::destroy(my_allocator, my_queue_representation); + r1::cache_aligned_deallocate(my_queue_representation); + } + + concurrent_queue& operator=( const 
concurrent_queue& other ) { + //TODO: implement support for std::allocator_traits::propagate_on_container_copy_assignment + if (my_queue_representation != other.my_queue_representation) { + clear(); + my_allocator = other.my_allocator; + my_queue_representation->assign(*other.my_queue_representation, my_allocator, copy_construct_item); + } + return *this; + } + + concurrent_queue& operator=( concurrent_queue&& other ) { + //TODO: implement support for std::allocator_traits::propagate_on_container_move_assignment + if (my_queue_representation != other.my_queue_representation) { + clear(); + if (my_allocator == other.my_allocator) { + internal_swap(other); + } else { + my_queue_representation->assign(*other.my_queue_representation, other.my_allocator, move_construct_item); + other.clear(); + my_allocator = std::move(other.my_allocator); + } + } + return *this; + } + + concurrent_queue& operator=( std::initializer_list init ) { + assign(init); + return *this; + } + + template + void assign( InputIterator first, InputIterator last ) { + concurrent_queue src(first, last); + clear(); + my_queue_representation->assign(*src.my_queue_representation, my_allocator, move_construct_item); + } + + void assign( std::initializer_list init ) { + assign(init.begin(), init.end()); + } + + void swap ( concurrent_queue& other ) { + //TODO: implement support for std::allocator_traits::propagate_on_container_swap + __TBB_ASSERT(my_allocator == other.my_allocator, "unequal allocators"); + internal_swap(other); + } + + // Enqueue an item at tail of queue. + void push(const T& value) { + internal_push(value); + } + + void push(T&& value) { + internal_push(std::move(value)); + } + + template + void emplace( Args&&... args ) { + internal_push(std::forward(args)...); + } + + // Attempt to dequeue an item from head of queue. + /** Does not wait for item to become available. + Returns true if successful; false otherwise. */ + bool try_pop( T& result ) { + return internal_try_pop(&result); + } + + // Return the number of items in the queue; thread unsafe + size_type unsafe_size() const { + std::ptrdiff_t size = my_queue_representation->size(); + return size < 0 ? 0 : size_type(size); + } + + // Equivalent to size()==0. + __TBB_nodiscard bool empty() const { + return my_queue_representation->empty(); + } + + // Clear the queue. not thread-safe. + void clear() { + my_queue_representation->clear(my_allocator); + } + + // Return allocator object + allocator_type get_allocator() const { return my_allocator; } + + //------------------------------------------------------------------------ + // The iterators are intended only for debugging. They are slow and not thread safe. + //------------------------------------------------------------------------ + + iterator unsafe_begin() { return concurrent_queue_iterator_provider::get(*this); } + iterator unsafe_end() { return iterator(); } + const_iterator unsafe_begin() const { return concurrent_queue_iterator_provider::get(*this); } + const_iterator unsafe_end() const { return const_iterator(); } + const_iterator unsafe_cbegin() const { return concurrent_queue_iterator_provider::get(*this); } + const_iterator unsafe_cend() const { return const_iterator(); } + +private: + void internal_swap(concurrent_queue& src) { + using std::swap; + swap(my_queue_representation, src.my_queue_representation); + } + + template + void internal_push( Args&&... 
args ) { + ticket_type k = my_queue_representation->tail_counter++; + my_queue_representation->choose(k).push(k, *my_queue_representation, my_allocator, std::forward(args)...); + } + + bool internal_try_pop( void* dst ) { + return internal_try_pop_impl(dst, *my_queue_representation, my_allocator).first; + } + + template + friend class concurrent_queue_iterator; + + static void copy_construct_item(T* location, const void* src) { + // TODO: use allocator_traits for copy construction + new (location) value_type(*static_cast(src)); + // queue_allocator_traits::construct(my_allocator, location, *static_cast(src)); + } + + static void move_construct_item(T* location, const void* src) { + // TODO: use allocator_traits for move construction + new (location) value_type(std::move(*static_cast(const_cast(src)))); + } + + queue_allocator_type my_allocator; + queue_representation_type* my_queue_representation; + + friend void swap( concurrent_queue& lhs, concurrent_queue& rhs ) { + lhs.swap(rhs); + } + + friend bool operator==( const concurrent_queue& lhs, const concurrent_queue& rhs ) { + return lhs.unsafe_size() == rhs.unsafe_size() && std::equal(lhs.unsafe_begin(), lhs.unsafe_end(), rhs.unsafe_begin()); + } + +#if !__TBB_CPP20_COMPARISONS_PRESENT + friend bool operator!=( const concurrent_queue& lhs, const concurrent_queue& rhs ) { + return !(lhs == rhs); + } +#endif // __TBB_CPP20_COMPARISONS_PRESENT +}; // class concurrent_queue + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT +// Deduction guide for the constructor from two iterators +template >, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_queue( It, It, Alloc = Alloc() ) +-> concurrent_queue, Alloc>; + +#endif /* __TBB_CPP17_DEDUCTION_GUIDES_PRESENT */ + +class concurrent_monitor; + +// The concurrent monitor tags for concurrent_bounded_queue. +static constexpr std::size_t cbq_slots_avail_tag = 0; +static constexpr std::size_t cbq_items_avail_tag = 1; +} // namespace d2 + + +namespace r1 { + class concurrent_monitor; + + TBB_EXPORT std::uint8_t* __TBB_EXPORTED_FUNC allocate_bounded_queue_rep( std::size_t queue_rep_size ); + TBB_EXPORT void __TBB_EXPORTED_FUNC deallocate_bounded_queue_rep( std::uint8_t* mem, std::size_t queue_rep_size ); + TBB_EXPORT void __TBB_EXPORTED_FUNC abort_bounded_queue_monitors( concurrent_monitor* monitors ); + TBB_EXPORT void __TBB_EXPORTED_FUNC notify_bounded_queue_monitor( concurrent_monitor* monitors, std::size_t monitor_tag + , std::size_t ticket ); + TBB_EXPORT void __TBB_EXPORTED_FUNC wait_bounded_queue_monitor( concurrent_monitor* monitors, std::size_t monitor_tag, + std::ptrdiff_t target, d1::delegate_base& predicate ); +} // namespace r1 + + +namespace d2 { +// A high-performance thread-safe blocking concurrent bounded queue. +// Supports boundedness and blocking semantics. +// Multiple threads may each push and pop concurrently. +// Assignment construction is not allowed. 
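+//
+// Illustrative usage sketch (not part of the upstream header): a minimal example
+// of how the two queue types defined in this file are typically driven, assuming
+// the include layout used by this patch.
+//
+//   #include "third_party/tbb/concurrent_queue.h"
+//
+//   void queue_usage_sketch() {
+//       tbb::concurrent_queue<int> q;            // unbounded, non-blocking
+//       q.push(1);
+//       int v;
+//       if (q.try_pop(v)) { /* v == 1 */ }       // returns false when the queue is empty
+//
+//       tbb::concurrent_bounded_queue<int> bq;   // bounded, blocking
+//       bq.set_capacity(4);                      // push() waits once 4 items are queued
+//       bq.push(42);
+//       int out;
+//       bq.pop(out);                             // waits until an item is available
+//       if (!bq.try_push(7)) { /* queue was full */ }
+//       bq.abort();                              // blocked push()/pop() throw user_abort
+//   }
+//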
+template > +class concurrent_bounded_queue { + using allocator_traits_type = tbb::detail::allocator_traits; + using queue_representation_type = concurrent_queue_rep; + using queue_allocator_type = typename allocator_traits_type::template rebind_alloc; + using queue_allocator_traits = tbb::detail::allocator_traits; + + template + void internal_wait(r1::concurrent_monitor* monitors, std::size_t monitor_tag, std::ptrdiff_t target, FuncType pred) { + d1::delegated_function func(pred); + r1::wait_bounded_queue_monitor(monitors, monitor_tag, target, func); + } +public: + using size_type = std::ptrdiff_t; + using value_type = T; + using reference = T&; + using const_reference = const T&; + using difference_type = std::ptrdiff_t; + + using allocator_type = Allocator; + using pointer = typename allocator_traits_type::pointer; + using const_pointer = typename allocator_traits_type::const_pointer; + + using iterator = concurrent_queue_iterator; + using const_iterator = concurrent_queue_iterator ; + + concurrent_bounded_queue() : concurrent_bounded_queue(allocator_type()) {} + + explicit concurrent_bounded_queue( const allocator_type& a ) : + my_allocator(a), my_capacity(0), my_abort_counter(0), my_queue_representation(nullptr) + { + my_queue_representation = reinterpret_cast( + r1::allocate_bounded_queue_rep(sizeof(queue_representation_type))); + my_monitors = reinterpret_cast(my_queue_representation + 1); + queue_allocator_traits::construct(my_allocator, my_queue_representation); + my_capacity = std::size_t(-1) / (queue_representation_type::item_size > 1 ? queue_representation_type::item_size : 2); + + __TBB_ASSERT(is_aligned(my_queue_representation, max_nfs_size), "alignment error" ); + __TBB_ASSERT(is_aligned(&my_queue_representation->head_counter, max_nfs_size), "alignment error" ); + __TBB_ASSERT(is_aligned(&my_queue_representation->tail_counter, max_nfs_size), "alignment error" ); + __TBB_ASSERT(is_aligned(&my_queue_representation->array, max_nfs_size), "alignment error" ); + } + + template + concurrent_bounded_queue( InputIterator begin, InputIterator end, const allocator_type& a = allocator_type() ) : + concurrent_bounded_queue(a) + { + for (; begin != end; ++begin) + push(*begin); + } + + concurrent_bounded_queue( std::initializer_list init, const allocator_type& alloc = allocator_type() ): + concurrent_bounded_queue(init.begin(), init.end(), alloc) + {} + + concurrent_bounded_queue( const concurrent_bounded_queue& src, const allocator_type& a ) : + concurrent_bounded_queue(a) + { + my_queue_representation->assign(*src.my_queue_representation, my_allocator, copy_construct_item); + } + + concurrent_bounded_queue( const concurrent_bounded_queue& src ) : + concurrent_bounded_queue(queue_allocator_traits::select_on_container_copy_construction(src.get_allocator())) + { + my_queue_representation->assign(*src.my_queue_representation, my_allocator, copy_construct_item); + } + + // Move constructors + concurrent_bounded_queue( concurrent_bounded_queue&& src ) : + concurrent_bounded_queue(std::move(src.my_allocator)) + { + internal_swap(src); + } + + concurrent_bounded_queue( concurrent_bounded_queue&& src, const allocator_type& a ) : + concurrent_bounded_queue(a) + { + // checking that memory allocated by one instance of allocator can be deallocated + // with another + if (my_allocator == src.my_allocator) { + internal_swap(src); + } else { + // allocators are different => performing per-element move + my_queue_representation->assign(*src.my_queue_representation, my_allocator, move_construct_item); 
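+            // The moved-from queue is then cleared explicitly: its elements were
+            // moved out one by one, so it is not left holding moved-from items.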
+ src.clear(); + } + } + + // Destroy queue + ~concurrent_bounded_queue() { + clear(); + my_queue_representation->clear(my_allocator); + queue_allocator_traits::destroy(my_allocator, my_queue_representation); + r1::deallocate_bounded_queue_rep(reinterpret_cast(my_queue_representation), + sizeof(queue_representation_type)); + } + + concurrent_bounded_queue& operator=( const concurrent_bounded_queue& other ) { + //TODO: implement support for std::allocator_traits::propagate_on_container_copy_assignment + if (my_queue_representation != other.my_queue_representation) { + clear(); + my_allocator = other.my_allocator; + my_queue_representation->assign(*other.my_queue_representation, my_allocator, copy_construct_item); + } + return *this; + } + + concurrent_bounded_queue& operator=( concurrent_bounded_queue&& other ) { + //TODO: implement support for std::allocator_traits::propagate_on_container_move_assignment + if (my_queue_representation != other.my_queue_representation) { + clear(); + if (my_allocator == other.my_allocator) { + internal_swap(other); + } else { + my_queue_representation->assign(*other.my_queue_representation, other.my_allocator, move_construct_item); + other.clear(); + my_allocator = std::move(other.my_allocator); + } + } + return *this; + } + + concurrent_bounded_queue& operator=( std::initializer_list init ) { + assign(init); + return *this; + } + + template + void assign( InputIterator first, InputIterator last ) { + concurrent_bounded_queue src(first, last); + clear(); + my_queue_representation->assign(*src.my_queue_representation, my_allocator, move_construct_item); + } + + void assign( std::initializer_list init ) { + assign(init.begin(), init.end()); + } + + void swap ( concurrent_bounded_queue& other ) { + //TODO: implement support for std::allocator_traits::propagate_on_container_swap + __TBB_ASSERT(my_allocator == other.my_allocator, "unequal allocators"); + internal_swap(other); + } + + // Enqueue an item at tail of queue. + void push( const T& value ) { + internal_push(value); + } + + void push( T&& value ) { + internal_push(std::move(value)); + } + + // Enqueue an item at tail of queue if queue is not already full. + // Does not wait for queue to become not full. + // Returns true if item is pushed; false if queue was already full. + bool try_push( const T& value ) { + return internal_push_if_not_full(value); + } + + bool try_push( T&& value ) { + return internal_push_if_not_full(std::move(value)); + } + + template + void emplace( Args&&... args ) { + internal_push(std::forward(args)...); + } + + template + bool try_emplace( Args&&... args ) { + return internal_push_if_not_full(std::forward(args)...); + } + + // Attempt to dequeue an item from head of queue. + void pop( T& result ) { + internal_pop(&result); + } + + /** Does not wait for item to become available. + Returns true if successful; false otherwise. */ + bool try_pop( T& result ) { + return internal_pop_if_present(&result); + } + + void abort() { + internal_abort(); + } + + // Return the number of items in the queue; thread unsafe + std::ptrdiff_t size() const { + return my_queue_representation->size(); + } + + void set_capacity( size_type new_capacity ) { + std::ptrdiff_t c = new_capacity < 0 ? infinite_capacity : new_capacity; + my_capacity = c; + } + + size_type capacity() const { + return my_capacity; + } + + // Equivalent to size()==0. + __TBB_nodiscard bool empty() const { + return my_queue_representation->empty(); + } + + // Clear the queue. not thread-safe. 
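+    // (All concurrent pushes and pops must have completed before clear() is
+    // called; the representation is torn down without any synchronization.)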
+ void clear() { + my_queue_representation->clear(my_allocator); + } + + // Return allocator object + allocator_type get_allocator() const { return my_allocator; } + + //------------------------------------------------------------------------ + // The iterators are intended only for debugging. They are slow and not thread safe. + //------------------------------------------------------------------------ + + iterator unsafe_begin() { return concurrent_queue_iterator_provider::get(*this); } + iterator unsafe_end() { return iterator(); } + const_iterator unsafe_begin() const { return concurrent_queue_iterator_provider::get(*this); } + const_iterator unsafe_end() const { return const_iterator(); } + const_iterator unsafe_cbegin() const { return concurrent_queue_iterator_provider::get(*this); } + const_iterator unsafe_cend() const { return const_iterator(); } + +private: + void internal_swap( concurrent_bounded_queue& src ) { + std::swap(my_queue_representation, src.my_queue_representation); + std::swap(my_monitors, src.my_monitors); + } + + static constexpr std::ptrdiff_t infinite_capacity = std::ptrdiff_t(~size_type(0) / 2); + + template + void internal_push( Args&&... args ) { + unsigned old_abort_counter = my_abort_counter.load(std::memory_order_relaxed); + ticket_type ticket = my_queue_representation->tail_counter++; + std::ptrdiff_t target = ticket - my_capacity; + + if (static_cast(my_queue_representation->head_counter.load(std::memory_order_relaxed)) <= target) { // queue is full + auto pred = [&] { + if (my_abort_counter.load(std::memory_order_relaxed) != old_abort_counter) { + throw_exception(exception_id::user_abort); + } + + return static_cast(my_queue_representation->head_counter.load(std::memory_order_relaxed)) <= target; + }; + + try_call( [&] { + internal_wait(my_monitors, cbq_slots_avail_tag, target, pred); + }).on_exception( [&] { + my_queue_representation->choose(ticket).abort_push(ticket, *my_queue_representation, my_allocator); + }); + + } + __TBB_ASSERT((static_cast(my_queue_representation->head_counter.load(std::memory_order_relaxed)) > target), nullptr); + my_queue_representation->choose(ticket).push(ticket, *my_queue_representation, my_allocator, std::forward(args)...); + r1::notify_bounded_queue_monitor(my_monitors, cbq_items_avail_tag, ticket); + } + + template + bool internal_push_if_not_full( Args&&... args ) { + ticket_type ticket = my_queue_representation->tail_counter.load(std::memory_order_relaxed); + do { + if (static_cast(ticket - my_queue_representation->head_counter.load(std::memory_order_relaxed)) >= my_capacity) { + // Queue is full + return false; + } + // Queue had empty slot with ticket k when we looked. Attempt to claim that slot. + // Another thread claimed the slot, so retry. 
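+            // On CAS failure, compare_exchange_strong reloads `ticket` with the tail
+            // value installed by another thread, so the capacity check above is retried
+            // against fresh state; on success this thread owns slot `ticket`.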
+ } while (!my_queue_representation->tail_counter.compare_exchange_strong(ticket, ticket + 1)); + + my_queue_representation->choose(ticket).push(ticket, *my_queue_representation, my_allocator, std::forward(args)...); + r1::notify_bounded_queue_monitor(my_monitors, cbq_items_avail_tag, ticket); + return true; + } + + void internal_pop( void* dst ) { + std::ptrdiff_t target; + // This loop is a single pop operation; abort_counter should not be re-read inside + unsigned old_abort_counter = my_abort_counter.load(std::memory_order_relaxed); + + do { + target = my_queue_representation->head_counter++; + if (static_cast(my_queue_representation->tail_counter.load(std::memory_order_relaxed)) <= target) { + auto pred = [&] { + if (my_abort_counter.load(std::memory_order_relaxed) != old_abort_counter) { + throw_exception(exception_id::user_abort); + } + + return static_cast(my_queue_representation->tail_counter.load(std::memory_order_relaxed)) <= target; + }; + + try_call( [&] { + internal_wait(my_monitors, cbq_items_avail_tag, target, pred); + }).on_exception( [&] { + my_queue_representation->head_counter--; + }); + } + __TBB_ASSERT(static_cast(my_queue_representation->tail_counter.load(std::memory_order_relaxed)) > target, nullptr); + } while (!my_queue_representation->choose(target).pop(dst, target, *my_queue_representation, my_allocator)); + + r1::notify_bounded_queue_monitor(my_monitors, cbq_slots_avail_tag, target); + } + + bool internal_pop_if_present( void* dst ) { + bool present{}; + ticket_type ticket{}; + std::tie(present, ticket) = internal_try_pop_impl(dst, *my_queue_representation, my_allocator); + + if (present) { + r1::notify_bounded_queue_monitor(my_monitors, cbq_slots_avail_tag, ticket); + } + return present; + } + + void internal_abort() { + ++my_abort_counter; + r1::abort_bounded_queue_monitors(my_monitors); + } + + static void copy_construct_item(T* location, const void* src) { + // TODO: use allocator_traits for copy construction + new (location) value_type(*static_cast(src)); + } + + static void move_construct_item(T* location, const void* src) { + // TODO: use allocator_traits for move construction + new (location) value_type(std::move(*static_cast(const_cast(src)))); + } + + template + friend class concurrent_queue_iterator; + + queue_allocator_type my_allocator; + std::ptrdiff_t my_capacity; + std::atomic my_abort_counter; + queue_representation_type* my_queue_representation; + + r1::concurrent_monitor* my_monitors; + + friend void swap( concurrent_bounded_queue& lhs, concurrent_bounded_queue& rhs ) { + lhs.swap(rhs); + } + + friend bool operator==( const concurrent_bounded_queue& lhs, const concurrent_bounded_queue& rhs ) { + return lhs.size() == rhs.size() && std::equal(lhs.unsafe_begin(), lhs.unsafe_end(), rhs.unsafe_begin()); + } + +#if !__TBB_CPP20_COMPARISONS_PRESENT + friend bool operator!=( const concurrent_bounded_queue& lhs, const concurrent_bounded_queue& rhs ) { + return !(lhs == rhs); + } +#endif // __TBB_CPP20_COMPARISONS_PRESENT +}; // class concurrent_bounded_queue + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT +// Deduction guide for the constructor from two iterators +template >> +concurrent_bounded_queue( It, It, Alloc = Alloc() ) +-> concurrent_bounded_queue, Alloc>; + +#endif /* __TBB_CPP17_DEDUCTION_GUIDES_PRESENT */ + +} //namespace d2 +} // namesapce detail + +inline namespace v1 { + +using detail::d2::concurrent_queue; +using detail::d2::concurrent_bounded_queue; +using detail::r1::user_abort; +using detail::r1::bad_last_alloc; + +} // inline namespace 
v1 +} // namespace tbb + +#endif // __TBB_concurrent_queue_H diff --git a/third_party/tbb/concurrent_set.h b/third_party/tbb/concurrent_set.h new file mode 100644 index 000000000..f1c8babdd --- /dev/null +++ b/third_party/tbb/concurrent_set.h @@ -0,0 +1,268 @@ +// clang-format off +/* + Copyright (c) 2019-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_concurrent_set_H +#define __TBB_concurrent_set_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_concurrent_skip_list.h" +#include "third_party/tbb/tbb_allocator.h" +#include "third_party/libcxx/functional" +#include "third_party/libcxx/utility" + +namespace tbb { +namespace detail { +namespace d2 { + +template +struct set_traits { + static constexpr std::size_t max_level = RandomGenerator::max_level; + using random_level_generator_type = RandomGenerator; + using key_type = Key; + using value_type = key_type; + using compare_type = KeyCompare; + using value_compare = compare_type; + using reference = value_type&; + using const_reference = const value_type&; + using allocator_type = Allocator; + + static constexpr bool allow_multimapping = AllowMultimapping; + + static const key_type& get_key(const_reference val) { + return val; + } + + static value_compare value_comp(compare_type comp) { return comp; } +}; // struct set_traits + +template +class concurrent_multiset; + +template , typename Allocator = tbb::tbb_allocator> +class concurrent_set : public concurrent_skip_list, Allocator, false>> { + using base_type = concurrent_skip_list, Allocator, false>>; +public: + using key_type = Key; + using value_type = typename base_type::value_type; + using size_type = typename base_type::size_type; + using difference_type = typename base_type::difference_type; + using key_compare = Compare; + using value_compare = typename base_type::value_compare; + using allocator_type = Allocator; + + using reference = typename base_type::reference; + using const_reference = typename base_type::const_reference; + using pointer = typename base_type::pointer; + using const_pointer = typename base_type::const_pointer; + + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + using node_type = typename base_type::node_type; + + // Include constructors of base_type + using base_type::base_type; + + // Required for implicit deduction guides + concurrent_set() = default; + concurrent_set( const concurrent_set& ) = default; + concurrent_set( const concurrent_set& other, const allocator_type& alloc ) : base_type(other, alloc) {} + concurrent_set( concurrent_set&& ) = default; + concurrent_set( concurrent_set&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} + // Required to respect the rule of 5 + concurrent_set& operator=( const concurrent_set& ) = default; + concurrent_set& operator=( concurrent_set&& ) = default; + + concurrent_set& operator=( std::initializer_list il ) { + base_type::operator= (il); + return 
*this; + } + + template + void merge(concurrent_set& source) { + this->internal_merge(source); + } + + template + void merge(concurrent_set&& source) { + this->internal_merge(std::move(source)); + } + + template + void merge(concurrent_multiset& source) { + this->internal_merge(source); + } + + template + void merge(concurrent_multiset&& source) { + this->internal_merge(std::move(source)); + } +}; // class concurrent_set + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template >, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_set( It, It, Comp = Comp(), Alloc = Alloc() ) +-> concurrent_set, Comp, Alloc>; + +template , + typename Alloc = tbb::tbb_allocator, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_set( std::initializer_list, Comp = Comp(), Alloc = Alloc() ) +-> concurrent_set; + +template >, + typename = std::enable_if_t>> +concurrent_set( It, It, Alloc ) +-> concurrent_set, + std::less>, Alloc>; + +template >> +concurrent_set( std::initializer_list, Alloc ) +-> concurrent_set, Alloc>; + +#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template +void swap( concurrent_set& lhs, + concurrent_set& rhs ) +{ + lhs.swap(rhs); +} + +template , typename Allocator = tbb::tbb_allocator> +class concurrent_multiset : public concurrent_skip_list, Allocator, true>> { + using base_type = concurrent_skip_list, Allocator, true>>; +public: + using key_type = Key; + using value_type = typename base_type::value_type; + using size_type = typename base_type::size_type; + using difference_type = typename base_type::difference_type; + using key_compare = Compare; + using value_compare = typename base_type::value_compare; + using allocator_type = Allocator; + + using reference = typename base_type::reference; + using const_reference = typename base_type::const_reference; + using pointer = typename base_type::pointer; + using const_pointer = typename base_type::const_pointer; + + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + using node_type = typename base_type::node_type; + + // Include constructors of base_type; + using base_type::base_type; + + // Required for implicit deduction guides + concurrent_multiset() = default; + concurrent_multiset( const concurrent_multiset& ) = default; + concurrent_multiset( const concurrent_multiset& other, const allocator_type& alloc ) : base_type(other, alloc) {} + concurrent_multiset( concurrent_multiset&& ) = default; + concurrent_multiset( concurrent_multiset&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} + // Required to respect the rule of 5 + concurrent_multiset& operator=( const concurrent_multiset& ) = default; + concurrent_multiset& operator=( concurrent_multiset&& ) = default; + + concurrent_multiset& operator=( std::initializer_list il ) { + base_type::operator= (il); + return *this; + } + + template + void merge(concurrent_set& source) { + this->internal_merge(source); + } + + template + void merge(concurrent_set&& source) { + this->internal_merge(std::move(source)); + } + + template + void merge(concurrent_multiset& source) { + this->internal_merge(source); + } + + template + void merge(concurrent_multiset&& source) { + this->internal_merge(std::move(source)); + } +}; // class concurrent_multiset + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template >, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = 
std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_multiset( It, It, Comp = Comp(), Alloc = Alloc() ) +-> concurrent_multiset, Comp, Alloc>; + +template , + typename Alloc = tbb::tbb_allocator, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_multiset( std::initializer_list, Comp = Comp(), Alloc = Alloc() ) +-> concurrent_multiset; + +template >, + typename = std::enable_if_t>> +concurrent_multiset( It, It, Alloc ) +-> concurrent_multiset, std::less>, Alloc>; + +template >> +concurrent_multiset( std::initializer_list, Alloc ) +-> concurrent_multiset, Alloc>; + +#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template +void swap( concurrent_multiset& lhs, + concurrent_multiset& rhs ) +{ + lhs.swap(rhs); +} + +} // namespace d2 +} // namespace detail + +inline namespace v1 { + +using detail::d2::concurrent_set; +using detail::d2::concurrent_multiset; +using detail::split; + +} // inline namespace v1 +} // namespace tbb + +#endif // __TBB_concurrent_set_H diff --git a/third_party/tbb/concurrent_unordered_map.h b/third_party/tbb/concurrent_unordered_map.h new file mode 100644 index 000000000..d9fce65d6 --- /dev/null +++ b/third_party/tbb/concurrent_unordered_map.h @@ -0,0 +1,415 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_concurrent_unordered_map_H +#define __TBB_concurrent_unordered_map_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_concurrent_unordered_base.h" +#include "third_party/tbb/tbb_allocator.h" +#include "third_party/libcxx/functional" + +namespace tbb { +namespace detail { +namespace d1 { + +template +struct concurrent_unordered_map_traits { + using value_type = std::pair; + using key_type = Key; + using allocator_type = Allocator; + using hash_compare_type = hash_compare; + static constexpr bool allow_multimapping = AllowMultimapping; + + static constexpr const key_type& get_key( const value_type& value ) { + return value.first; + } +}; // struct concurrent_unordered_map_traits + +template +class concurrent_unordered_multimap; + +template , typename KeyEqual = std::equal_to, + typename Allocator = tbb::tbb_allocator> > +class concurrent_unordered_map + : public concurrent_unordered_base> +{ + using traits_type = concurrent_unordered_map_traits; + using base_type = concurrent_unordered_base; +public: + using key_type = typename base_type::key_type; + using mapped_type = T; + using value_type = typename base_type::value_type; + using size_type = typename base_type::size_type; + using difference_type = typename base_type::difference_type; + using hasher = typename base_type::hasher; + using key_equal = typename base_type::key_equal; + using allocator_type = typename base_type::allocator_type; + using reference = typename base_type::reference; + using const_reference = typename base_type::const_reference; + using pointer = typename base_type::pointer; + using const_pointer = typename base_type::const_pointer; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + using local_iterator = typename base_type::local_iterator; + using const_local_iterator = typename base_type::const_local_iterator; + using node_type = typename base_type::node_type; + + // Include constructors of base type + using base_type::base_type; + + // Required for implicit deduction guides + concurrent_unordered_map() = default; + concurrent_unordered_map( const concurrent_unordered_map& ) = default; + concurrent_unordered_map( const concurrent_unordered_map& other, const allocator_type& alloc ) : base_type(other, alloc) {} + concurrent_unordered_map( concurrent_unordered_map&& ) = default; + concurrent_unordered_map( concurrent_unordered_map&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} + // Required to respect the rule of 5 + concurrent_unordered_map& operator=( const concurrent_unordered_map& ) = default; + concurrent_unordered_map& operator=( concurrent_unordered_map&& ) = default; + + concurrent_unordered_map& operator=( std::initializer_list il ) { + base_type::operator= (il); + return *this; + } + + // Observers + mapped_type& operator[]( const key_type& key ) { + iterator where = this->find(key); + + if (where == this->end()) { + where = this->emplace(std::piecewise_construct, std::forward_as_tuple(key), std::tuple<>()).first; + } + return where->second; + } + + mapped_type& operator[]( key_type&& key ) { + iterator where = this->find(key); + + if (where == this->end()) { + where = this->emplace(std::piecewise_construct, std::forward_as_tuple(std::move(key)), std::tuple<>()).first; + } + return where->second; + } + + mapped_type& at( const key_type& key ) { + iterator where = this->find(key); + + if (where == this->end()) { + 
throw_exception(exception_id::invalid_key); + } + return where->second; + } + + const mapped_type& at( const key_type& key ) const { + const_iterator where = this->find(key); + + if (where == this->end()) { + throw_exception(exception_id::out_of_range); + } + return where->second; + } + + using base_type::insert; + + template + typename std::enable_if::value, + std::pair>::type insert( P&& value ) { + return this->emplace(std::forward
<P>
(value)); + } + + template + typename std::enable_if::value, + iterator>::type insert( const_iterator hint, P&& value ) { + return this->emplace_hint(hint, std::forward
<P>
(value)); + } + + template + void merge( concurrent_unordered_map& source ) { + this->internal_merge(source); + } + + template + void merge( concurrent_unordered_map&& source ) { + this->internal_merge(std::move(source)); + } + + template + void merge( concurrent_unordered_multimap& source ) { + this->internal_merge(source); + } + + template + void merge( concurrent_unordered_multimap&& source ) { + this->internal_merge(std::move(source)); + } +}; // class concurrent_unordered_map + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT +template >, + typename KeyEq = std::equal_to>, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_map( It, It, std::size_t = {}, + Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) +-> concurrent_unordered_map, iterator_mapped_t, Hash, KeyEq, Alloc>; + +template >, + typename KeyEq = std::equal_to>, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_map( std::initializer_list>, std::size_t = {}, + Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) +-> concurrent_unordered_map, T, Hash, KeyEq, Alloc>; + +template >, + typename = std::enable_if_t>> +concurrent_unordered_map( It, It, std::size_t, Alloc ) +-> concurrent_unordered_map, iterator_mapped_t, + std::hash>, + std::equal_to>, Alloc>; + +// TODO: investigate if a deduction guide for concurrent_unordered_map(It, It, Alloc) is needed + +template >, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_map( It, It, std::size_t, Hash, Alloc ) +-> concurrent_unordered_map, iterator_mapped_t, + Hash, std::equal_to>, Alloc>; + +template >> +concurrent_unordered_map( std::initializer_list>, std::size_t, Alloc ) +-> concurrent_unordered_map, T, std::hash>, + std::equal_to>, Alloc>; + +template >> +concurrent_unordered_map( std::initializer_list>, Alloc ) +-> concurrent_unordered_map, T, std::hash>, + std::equal_to>, Alloc>; + +template >, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_map( std::initializer_list>, std::size_t, Hash, Alloc ) +-> concurrent_unordered_map, T, Hash, + std::equal_to>, Alloc>; + +#if __APPLE__ && __TBB_CLANG_VERSION == 100000 +// An explicit deduction guide is required for copy/move constructor with allocator for APPLE LLVM 10.0.0 +// due to an issue with generating an implicit deduction guide for these constructors under several strange surcumstances. 
+// Currently the issue takes place because the last template parameter for Traits is boolean, it should not affect the deduction guides +// The issue reproduces only on this version of the compiler +template +concurrent_unordered_map( concurrent_unordered_map, Alloc ) +-> concurrent_unordered_map; +#endif + +#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template +void swap( concurrent_unordered_map& lhs, + concurrent_unordered_map& rhs ) { + lhs.swap(rhs); +} + +template , typename KeyEqual = std::equal_to, + typename Allocator = tbb::tbb_allocator> > +class concurrent_unordered_multimap + : public concurrent_unordered_base> +{ + using traits_type = concurrent_unordered_map_traits; + using base_type = concurrent_unordered_base; +public: + using key_type = typename base_type::key_type; + using mapped_type = T; + using value_type = typename base_type::value_type; + using size_type = typename base_type::size_type; + using difference_type = typename base_type::difference_type; + using hasher = typename base_type::hasher; + using key_equal = typename base_type::key_equal; + using allocator_type = typename base_type::allocator_type; + using reference = typename base_type::reference; + using const_reference = typename base_type::const_reference; + using pointer = typename base_type::pointer; + using const_pointer = typename base_type::const_pointer; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + using local_iterator = typename base_type::local_iterator; + using const_local_iterator = typename base_type::const_local_iterator; + using node_type = typename base_type::node_type; + + // Include constructors of base type + using base_type::base_type; + using base_type::insert; + + // Required for implicit deduction guides + concurrent_unordered_multimap() = default; + concurrent_unordered_multimap( const concurrent_unordered_multimap& ) = default; + concurrent_unordered_multimap( const concurrent_unordered_multimap& other, const allocator_type& alloc ) : base_type(other, alloc) {} + concurrent_unordered_multimap( concurrent_unordered_multimap&& ) = default; + concurrent_unordered_multimap( concurrent_unordered_multimap&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} + // Required to respect the rule of 5 + concurrent_unordered_multimap& operator=( const concurrent_unordered_multimap& ) = default; + concurrent_unordered_multimap& operator=( concurrent_unordered_multimap&& ) = default; + + concurrent_unordered_multimap& operator=( std::initializer_list il ) { + base_type::operator= (il); + return *this; + } + + template + typename std::enable_if::value, + std::pair>::type insert( P&& value ) { + return this->emplace(std::forward
<P>
(value)); + } + + template + typename std::enable_if::value, + iterator>::type insert( const_iterator hint, P&& value ) { + return this->emplace_hint(hint, std::forward(value)); + } + + template + void merge( concurrent_unordered_map& source ) { + this->internal_merge(source); + } + + template + void merge( concurrent_unordered_map&& source ) { + this->internal_merge(std::move(source)); + } + + template + void merge( concurrent_unordered_multimap& source ) { + this->internal_merge(source); + } + + template + void merge( concurrent_unordered_multimap&& source ) { + this->internal_merge(std::move(source)); + } +}; // class concurrent_unordered_multimap + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template >, + typename KeyEq = std::equal_to>, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_multimap( It, It, std::size_t = {}, Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) +-> concurrent_unordered_multimap, iterator_mapped_t, Hash, KeyEq, Alloc>; + +template >, + typename KeyEq = std::equal_to>, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_multimap( std::initializer_list>, std::size_t = {}, + Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) +-> concurrent_unordered_multimap, T, Hash, KeyEq, Alloc>; + +template >, + typename = std::enable_if_t>> +concurrent_unordered_multimap( It, It, std::size_t, Alloc ) +-> concurrent_unordered_multimap, iterator_mapped_t, + std::hash>, + std::equal_to>, Alloc>; + +template >, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_multimap( It, It, std::size_t, Hash, Alloc ) +-> concurrent_unordered_multimap, iterator_mapped_t, Hash, + std::equal_to>, Alloc>; + +template >> +concurrent_unordered_multimap( std::initializer_list>, std::size_t, Alloc ) +-> concurrent_unordered_multimap, T, std::hash>, + std::equal_to>, Alloc>; + +template >> +concurrent_unordered_multimap( std::initializer_list>, Alloc ) +-> concurrent_unordered_multimap, T, std::hash>, + std::equal_to>, Alloc>; + +template >, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_multimap( std::initializer_list>, std::size_t, Hash, Alloc ) +-> concurrent_unordered_multimap, T, Hash, + std::equal_to>, Alloc>; + +#if __APPLE__ && __TBB_CLANG_VERSION == 100000 +// An explicit deduction guide is required for copy/move constructor with allocator for APPLE LLVM 10.0.0 +// due to an issue with generating an implicit deduction guide for these constructors under several strange surcumstances. 
+// Currently the issue takes place because the last template parameter for Traits is boolean, it should not affect the deduction guides +// The issue reproduces only on this version of the compiler +template +concurrent_unordered_multimap( concurrent_unordered_multimap, Alloc ) +-> concurrent_unordered_multimap; +#endif +#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template +void swap( concurrent_unordered_multimap& lhs, + concurrent_unordered_multimap& rhs ) { + lhs.swap(rhs); +} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { + +using detail::d1::concurrent_unordered_map; +using detail::d1::concurrent_unordered_multimap; +using detail::split; + +} // inline namespace v1 +} // namespace tbb + +#endif // __TBB_concurrent_unordered_map_H diff --git a/third_party/tbb/concurrent_unordered_set.h b/third_party/tbb/concurrent_unordered_set.h new file mode 100644 index 000000000..ff53ac024 --- /dev/null +++ b/third_party/tbb/concurrent_unordered_set.h @@ -0,0 +1,334 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_concurrent_unordered_set_H +#define __TBB_concurrent_unordered_set_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_concurrent_unordered_base.h" +#include "third_party/tbb/tbb_allocator.h" + +namespace tbb { +namespace detail { +namespace d1 { + +template +struct concurrent_unordered_set_traits { + using key_type = Key; + using value_type = key_type; + using allocator_type = Allocator; + using hash_compare_type = hash_compare; + static constexpr bool allow_multimapping = AllowMultimapping; + + static constexpr const key_type& get_key( const value_type& value ) { + return value; + } +}; // class concurrent_unordered_set_traits + +template +class concurrent_unordered_multiset; + +template , typename KeyEqual = std::equal_to, + typename Allocator = tbb::tbb_allocator> +class concurrent_unordered_set + : public concurrent_unordered_base> +{ + using traits_type = concurrent_unordered_set_traits; + using base_type = concurrent_unordered_base; +public: + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using size_type = typename base_type::size_type; + using difference_type = typename base_type::difference_type; + using hasher = typename base_type::hasher; + using key_equal = typename base_type::key_equal; + using allocator_type = typename base_type::allocator_type; + using reference = typename base_type::reference; + using const_reference = typename base_type::const_reference; + using pointer = typename base_type::pointer; + using const_pointer = typename base_type::const_pointer; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + using local_iterator = typename base_type::local_iterator; + using const_local_iterator = typename base_type::const_local_iterator; + using node_type = typename base_type::node_type; + + // Include 
constructors of base_type; + using base_type::base_type; + + // Required for implicit deduction guides + concurrent_unordered_set() = default; + concurrent_unordered_set( const concurrent_unordered_set& ) = default; + concurrent_unordered_set( const concurrent_unordered_set& other, const allocator_type& alloc ) : base_type(other, alloc) {} + concurrent_unordered_set( concurrent_unordered_set&& ) = default; + concurrent_unordered_set( concurrent_unordered_set&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} + // Required to respect the rule of 5 + concurrent_unordered_set& operator=( const concurrent_unordered_set& ) = default; + concurrent_unordered_set& operator=( concurrent_unordered_set&& ) = default; + + concurrent_unordered_set& operator=( std::initializer_list il ) { + base_type::operator= (il); + return *this; + } + + template + void merge( concurrent_unordered_set& source ) { + this->internal_merge(source); + } + + template + void merge( concurrent_unordered_set&& source ) { + this->internal_merge(std::move(source)); + } + + template + void merge( concurrent_unordered_multiset& source ) { + this->internal_merge(source); + } + + template + void merge( concurrent_unordered_multiset&& source ) { + this->internal_merge(std::move(source)); + } +}; // class concurrent_unordered_set + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template >, + typename KeyEq = std::equal_to>, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_set( It, It, std::size_t = {}, Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) +-> concurrent_unordered_set, Hash, KeyEq, Alloc>; + +template , + typename KeyEq = std::equal_to, + typename Alloc = tbb::tbb_allocator, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_set( std::initializer_list, std::size_t = {}, + Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) +-> concurrent_unordered_set; + +template >, + typename = std::enable_if_t>> +concurrent_unordered_set( It, It, std::size_t, Alloc ) +-> concurrent_unordered_set, std::hash>, + std::equal_to>, Alloc>; + +template >, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_set( It, It, std::size_t, Hash, Alloc ) +-> concurrent_unordered_set, Hash, std::equal_to>, Alloc>; + +template >> +concurrent_unordered_set( std::initializer_list, std::size_t, Alloc ) +-> concurrent_unordered_set, std::equal_to, Alloc>; + +template >> +concurrent_unordered_set( std::initializer_list, Alloc ) +-> concurrent_unordered_set, std::equal_to, Alloc>; + +template >, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_set( std::initializer_list, std::size_t, Hash, Alloc ) +-> concurrent_unordered_set, Alloc>; + +#if __APPLE__ && __TBB_CLANG_VERSION == 100000 +// An explicit deduction guide is required for copy/move constructor with allocator for APPLE LLVM 10.0.0 +// due to an issue with generating an implicit deduction guide for these constructors under several strange surcumstances. 
+// Currently the issue takes place because the last template parameter for Traits is boolean, it should not affect the deduction guides +// The issue reproduces only on this version of the compiler +template +concurrent_unordered_set( concurrent_unordered_set, Alloc ) +-> concurrent_unordered_set; +#endif +#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template +void swap( concurrent_unordered_set& lhs, + concurrent_unordered_set& rhs ) { + lhs.swap(rhs); +} + +template , typename KeyEqual = std::equal_to, + typename Allocator = tbb::tbb_allocator> +class concurrent_unordered_multiset + : public concurrent_unordered_base> +{ + using traits_type = concurrent_unordered_set_traits; + using base_type = concurrent_unordered_base; +public: + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using size_type = typename base_type::size_type; + using difference_type = typename base_type::difference_type; + using hasher = typename base_type::hasher; + using key_equal = typename base_type::key_equal; + using allocator_type = typename base_type::allocator_type; + using reference = typename base_type::reference; + using const_reference = typename base_type::const_reference; + using pointer = typename base_type::pointer; + using const_pointer = typename base_type::const_pointer; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + using local_iterator = typename base_type::local_iterator; + using const_local_iterator = typename base_type::const_local_iterator; + using node_type = typename base_type::node_type; + + // Include constructors of base_type; + using base_type::base_type; + + // Required for implicit deduction guides + concurrent_unordered_multiset() = default; + concurrent_unordered_multiset( const concurrent_unordered_multiset& ) = default; + concurrent_unordered_multiset( const concurrent_unordered_multiset& other, const allocator_type& alloc ) : base_type(other, alloc) {} + concurrent_unordered_multiset( concurrent_unordered_multiset&& ) = default; + concurrent_unordered_multiset( concurrent_unordered_multiset&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} + // Required to respect the rule of 5 + concurrent_unordered_multiset& operator=( const concurrent_unordered_multiset& ) = default; + concurrent_unordered_multiset& operator=( concurrent_unordered_multiset&& ) = default; + + concurrent_unordered_multiset& operator=( std::initializer_list il ) { + base_type::operator= (il); + return *this; + } + + template + void merge( concurrent_unordered_set& source ) { + this->internal_merge(source); + } + + template + void merge( concurrent_unordered_set&& source ) { + this->internal_merge(std::move(source)); + } + + template + void merge( concurrent_unordered_multiset& source ) { + this->internal_merge(source); + } + + template + void merge( concurrent_unordered_multiset&& source ) { + this->internal_merge(std::move(source)); + } +}; // class concurrent_unordered_multiset + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT +template >, + typename KeyEq = std::equal_to>, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_multiset( It, It, std::size_t = {}, Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) +-> concurrent_unordered_multiset, Hash, KeyEq, Alloc>; + +template , + typename KeyEq = 
std::equal_to, + typename Alloc = tbb::tbb_allocator, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_multiset( std::initializer_list, std::size_t = {}, + Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) +-> concurrent_unordered_multiset; + +template >, + typename = std::enable_if_t>> +concurrent_unordered_multiset( It, It, std::size_t, Alloc ) +-> concurrent_unordered_multiset, std::hash>, + std::equal_to>, Alloc>; + +template >, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_multiset( It, It, std::size_t, Hash, Alloc ) +-> concurrent_unordered_multiset, Hash, std::equal_to>, Alloc>; + +template >> +concurrent_unordered_multiset( std::initializer_list, std::size_t, Alloc ) +-> concurrent_unordered_multiset, std::equal_to, Alloc>; + +template >> +concurrent_unordered_multiset( std::initializer_list, Alloc ) +-> concurrent_unordered_multiset, std::equal_to, Alloc>; + +template >, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_multiset( std::initializer_list, std::size_t, Hash, Alloc ) +-> concurrent_unordered_multiset, Alloc>; + +#if __APPLE__ && __TBB_CLANG_VERSION == 100000 +// An explicit deduction guide is required for copy/move constructor with allocator for APPLE LLVM 10.0.0 +// due to an issue with generating an implicit deduction guide for these constructors under several strange surcumstances. +// Currently the issue takes place because the last template parameter for Traits is boolean, it should not affect the deduction guides +// The issue reproduces only on this version of the compiler +template +concurrent_unordered_multiset( concurrent_unordered_multiset, Alloc ) +-> concurrent_unordered_multiset; +#endif +#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template +void swap( concurrent_unordered_multiset& lhs, + concurrent_unordered_multiset& rhs ) { + lhs.swap(rhs); +} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { + +using detail::d1::concurrent_unordered_set; +using detail::d1::concurrent_unordered_multiset; +using detail::split; + +} // inline namespace v1 +} // namespace tbb + +#endif // __TBB_concurrent_unordered_set_H diff --git a/third_party/tbb/concurrent_vector.h b/third_party/tbb/concurrent_vector.h new file mode 100644 index 000000000..012dbe931 --- /dev/null +++ b/third_party/tbb/concurrent_vector.h @@ -0,0 +1,1130 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_concurrent_vector_H +#define __TBB_concurrent_vector_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_allocator_traits.h" +#include "third_party/tbb/detail/_segment_table.h" +#include "third_party/tbb/detail/_containers_helpers.h" +#include "third_party/tbb/blocked_range.h" +#include "third_party/tbb/cache_aligned_allocator.h" + +#include "third_party/libcxx/algorithm" +#include "third_party/libcxx/utility" // std::move_if_noexcept +#include "third_party/libcxx/algorithm" +#if __TBB_CPP20_COMPARISONS_PRESENT +#include "third_party/libcxx/compare" +#endif + +namespace tbb { +namespace detail { +namespace d1 { + +template +class vector_iterator { + using vector_type = Vector; + +public: + using value_type = Value; + using size_type = typename vector_type::size_type; + using difference_type = typename vector_type::difference_type; + using pointer = value_type*; + using reference = value_type&; + using iterator_category = std::random_access_iterator_tag; + + template + friend vector_iterator operator+( typename vector_iterator::difference_type, const vector_iterator& ); + + template + friend typename vector_iterator::difference_type operator-( const vector_iterator&, const vector_iterator& ); + + template + friend bool operator==( const vector_iterator&, const vector_iterator& ); + + template + friend bool operator<( const vector_iterator&, const vector_iterator& ); + + template + friend class vector_iterator; + + template + friend class concurrent_vector; + +private: + vector_iterator( const vector_type& vector, size_type index, value_type* item = nullptr ) + : my_vector(const_cast(&vector)), my_index(index), my_item(item) + {} + +public: + vector_iterator() : my_vector(nullptr), my_index(~size_type(0)), my_item(nullptr) + {} + + vector_iterator( const vector_iterator& other ) + : my_vector(other.my_vector), my_index(other.my_index), my_item(other.my_item) + {} + + vector_iterator& operator=( const vector_iterator& other ) { + my_vector = other.my_vector; + my_index = other.my_index; + my_item = other.my_item; + return *this; + } + + vector_iterator operator+( difference_type offset ) const { + return vector_iterator(*my_vector, my_index + offset); + } + + vector_iterator& operator+=( difference_type offset ) { + my_index += offset; + my_item = nullptr; + return *this; + } + + vector_iterator operator-( difference_type offset ) const { + return vector_iterator(*my_vector, my_index - offset); + } + + vector_iterator& operator-=( difference_type offset ) { + my_index -= offset; + my_item = nullptr; + return *this; + } + + reference operator*() const { + value_type *item = my_item; + if (item == nullptr) { + item = &my_vector->internal_subscript(my_index); + } else { + __TBB_ASSERT(item == &my_vector->internal_subscript(my_index), "corrupt cache"); + } + return *item; + } + + pointer operator->() const { return &(operator*()); } + + reference operator[]( difference_type k ) const { + return my_vector->internal_subscript(my_index + k); + } + + vector_iterator& operator++() { + ++my_index; + if (my_item != nullptr) { + if (vector_type::is_first_element_in_segment(my_index)) { + // If the iterator crosses a segment boundary, the pointer become invalid + // as possibly next segment is in another memory location + my_item = nullptr; + } else { + ++my_item; + } + } + return *this; + } + + vector_iterator operator++(int) { + vector_iterator result 
= *this; + ++(*this); + return result; + } + + vector_iterator& operator--() { + __TBB_ASSERT(my_index > 0, "operator--() applied to iterator already at beginning of concurrent_vector"); + --my_index; + if (my_item != nullptr) { + if (vector_type::is_first_element_in_segment(my_index)) { + // If the iterator crosses a segment boundary, the pointer become invalid + // as possibly next segment is in another memory location + my_item = nullptr; + } else { + --my_item; + } + } + return *this; + } + + vector_iterator operator--(int) { + vector_iterator result = *this; + --(*this); + return result; + } + +private: + // concurrent_vector over which we are iterating. + vector_type* my_vector; + + // Index into the vector + size_type my_index; + + // Caches my_vector *it; + // If my_item == nullptr cached value is not available use internal_subscript(my_index) + mutable value_type* my_item; +}; // class vector_iterator + +template +vector_iterator operator+( typename vector_iterator::difference_type offset, + const vector_iterator& v ) +{ + return vector_iterator(*v.my_vector, v.my_index + offset); +} + +template +typename vector_iterator::difference_type operator-( const vector_iterator& i, + const vector_iterator& j ) +{ + using difference_type = typename vector_iterator::difference_type; + return static_cast(i.my_index) - static_cast(j.my_index); +} + +template +bool operator==( const vector_iterator& i, const vector_iterator& j ) { + return i.my_vector == j.my_vector && i.my_index == j.my_index; +} + +template +bool operator!=( const vector_iterator& i, const vector_iterator& j ) { + return !(i == j); +} + +template +bool operator<( const vector_iterator& i, const vector_iterator& j ) { + return i.my_index < j.my_index; +} + +template +bool operator>( const vector_iterator& i, const vector_iterator& j ) { + return j < i; +} + +template +bool operator>=( const vector_iterator& i, const vector_iterator& j ) { + return !(i < j); +} + +template +bool operator<=( const vector_iterator& i, const vector_iterator& j ) { + return !(j < i); +} + +static constexpr std::size_t embedded_table_num_segments = 3; + +template > +class concurrent_vector + : private segment_table, embedded_table_num_segments> +{ + using self_type = concurrent_vector; + using base_type = segment_table; + + friend class segment_table; + + template + class generic_range_type : public tbb::blocked_range { + using base_type = tbb::blocked_range; + public: + using value_type = T; + using reference = T&; + using const_reference = const T&; + using iterator = Iterator; + using difference_type = std::ptrdiff_t; + + using base_type::base_type; + + template + generic_range_type( const generic_range_type& r) : blocked_range(r.begin(), r.end(), r.grainsize()) {} + generic_range_type( generic_range_type& r, split ) : blocked_range(r, split()) {} + }; // class generic_range_type + + static_assert(std::is_same::value, + "value_type of the container must be the same as its allocator's"); + using allocator_traits_type = tbb::detail::allocator_traits; + // Segment table for concurrent_vector can be extended + static constexpr bool allow_table_extending = true; + static constexpr bool is_noexcept_assignment = allocator_traits_type::propagate_on_container_move_assignment::value || + allocator_traits_type::is_always_equal::value; + static constexpr bool is_noexcept_swap = allocator_traits_type::propagate_on_container_swap::value || + allocator_traits_type::is_always_equal::value; + +public: + using value_type = T; + using allocator_type = Allocator; + 
using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using reference = value_type&; + using const_reference = const value_type&; + + using pointer = typename allocator_traits_type::pointer; + using const_pointer = typename allocator_traits_type::const_pointer; + + using iterator = vector_iterator; + using const_iterator = vector_iterator; + using reverse_iterator = std::reverse_iterator; + using const_reverse_iterator = std::reverse_iterator; + + using range_type = generic_range_type; + using const_range_type = generic_range_type; + + concurrent_vector() : concurrent_vector(allocator_type()) {} + + explicit concurrent_vector( const allocator_type& alloc ) noexcept + : base_type(alloc) + {} + + explicit concurrent_vector( size_type count, const value_type& value, + const allocator_type& alloc = allocator_type() ) + : concurrent_vector(alloc) + { + try_call( [&] { + grow_by(count, value); + } ).on_exception( [&] { + base_type::clear(); + }); + } + + explicit concurrent_vector( size_type count, const allocator_type& alloc = allocator_type() ) + : concurrent_vector(alloc) + { + try_call( [&] { + grow_by(count); + } ).on_exception( [&] { + base_type::clear(); + }); + } + + template + concurrent_vector( InputIterator first, InputIterator last, const allocator_type& alloc = allocator_type() ) + : concurrent_vector(alloc) + { + try_call( [&] { + grow_by(first, last); + } ).on_exception( [&] { + base_type::clear(); + }); + } + + concurrent_vector( const concurrent_vector& other ) + : base_type(segment_table_allocator_traits::select_on_container_copy_construction(other.get_allocator())) + { + try_call( [&] { + grow_by(other.begin(), other.end()); + } ).on_exception( [&] { + base_type::clear(); + }); + } + + concurrent_vector( const concurrent_vector& other, const allocator_type& alloc ) + : base_type(other, alloc) {} + + concurrent_vector(concurrent_vector&& other) noexcept + : base_type(std::move(other)) + {} + + concurrent_vector( concurrent_vector&& other, const allocator_type& alloc ) + : base_type(std::move(other), alloc) + {} + + concurrent_vector( std::initializer_list init, + const allocator_type& alloc = allocator_type() ) + : concurrent_vector(init.begin(), init.end(), alloc) + {} + + ~concurrent_vector() {} + + // Assignment + concurrent_vector& operator=( const concurrent_vector& other ) { + base_type::operator=(other); + return *this; + } + + concurrent_vector& operator=( concurrent_vector&& other ) noexcept(is_noexcept_assignment) { + base_type::operator=(std::move(other)); + return *this; + } + + concurrent_vector& operator=( std::initializer_list init ) { + assign(init); + return *this; + } + + void assign( size_type count, const value_type& value ) { + destroy_elements(); + grow_by(count, value); + } + + template + typename std::enable_if::value, void>::type + assign( InputIterator first, InputIterator last ) { + destroy_elements(); + grow_by(first, last); + } + + void assign( std::initializer_list init ) { + destroy_elements(); + assign(init.begin(), init.end()); + } + + // Concurrent growth + iterator grow_by( size_type delta ) { + return internal_grow_by_delta(delta); + } + + iterator grow_by( size_type delta, const value_type& value ) { + return internal_grow_by_delta(delta, value); + } + + template + typename std::enable_if::value, iterator>::type + grow_by( ForwardIterator first, ForwardIterator last ) { + auto delta = std::distance(first, last); + return internal_grow_by_delta(delta, first, last); + } + + iterator grow_by( std::initializer_list init ) { 
+ return grow_by(init.begin(), init.end()); + } + + iterator grow_to_at_least( size_type n ) { + return internal_grow_to_at_least(n); + } + iterator grow_to_at_least( size_type n, const value_type& value ) { + return internal_grow_to_at_least(n, value); + } + + iterator push_back( const value_type& item ) { + return internal_emplace_back(item); + } + + iterator push_back( value_type&& item ) { + return internal_emplace_back(std::move(item)); + } + + template + iterator emplace_back( Args&&... args ) { + return internal_emplace_back(std::forward(args)...); + } + + // Items access + reference operator[]( size_type index ) { + return internal_subscript(index); + } + const_reference operator[]( size_type index ) const { + return internal_subscript(index); + } + + reference at( size_type index ) { + return internal_subscript_with_exceptions(index); + } + const_reference at( size_type index ) const { + return internal_subscript_with_exceptions(index); + } + + // Get range for iterating with parallel algorithms + range_type range( size_t grainsize = 1 ) { + return range_type(begin(), end(), grainsize); + } + + // Get const range for iterating with parallel algorithms + const_range_type range( size_t grainsize = 1 ) const { + return const_range_type(begin(), end(), grainsize); + } + + reference front() { + return internal_subscript(0); + } + + const_reference front() const { + return internal_subscript(0); + } + + reference back() { + return internal_subscript(size() - 1); + } + + const_reference back() const { + return internal_subscript(size() - 1); + } + + // Iterators + iterator begin() { return iterator(*this, 0); } + const_iterator begin() const { return const_iterator(*this, 0); } + const_iterator cbegin() const { return const_iterator(*this, 0); } + + iterator end() { return iterator(*this, size()); } + const_iterator end() const { return const_iterator(*this, size()); } + const_iterator cend() const { return const_iterator(*this, size()); } + + reverse_iterator rbegin() { return reverse_iterator(end()); } + const_reverse_iterator rbegin() const { return const_reverse_iterator(end()); } + const_reverse_iterator crbegin() const { return const_reverse_iterator(cend()); } + + reverse_iterator rend() { return reverse_iterator(begin()); } + const_reverse_iterator rend() const { return const_reverse_iterator(begin()); } + const_reverse_iterator crend() const { return const_reverse_iterator(cbegin()); } + + allocator_type get_allocator() const { + return base_type::get_allocator(); + } + + // Storage + bool empty() const noexcept { + return 0 == size(); + } + + size_type size() const noexcept { + return std::min(this->my_size.load(std::memory_order_acquire), capacity()); + } + + size_type max_size() const noexcept { + return allocator_traits_type::max_size(base_type::get_allocator()); + } + + size_type capacity() const noexcept { + return base_type::capacity(); + } + + void reserve( size_type n ) { + if (n == 0) return; + + if (n > max_size()) { + tbb::detail::throw_exception(exception_id::reservation_length_error); + } + + this->assign_first_block_if_necessary(this->segment_index_of(n - 1) + 1); + base_type::reserve(n); + } + + void resize( size_type n ) { + internal_resize(n); + } + + void resize( size_type n, const value_type& val ) { + internal_resize(n, val); + } + + void shrink_to_fit() { + internal_compact(); + } + + void swap(concurrent_vector& other) noexcept(is_noexcept_swap) { + base_type::swap(other); + } + + void clear() { + destroy_elements(); + } + +private: + using segment_type = 
typename base_type::segment_type; + using segment_table_type = typename base_type::segment_table_type; + using segment_table_allocator_traits = typename base_type::segment_table_allocator_traits; + using segment_index_type = typename base_type::segment_index_type; + + using segment_element_type = typename base_type::value_type; + using segment_element_allocator_type = typename allocator_traits_type::template rebind_alloc; + using segment_element_allocator_traits = tbb::detail::allocator_traits; + + segment_table_type allocate_long_table( const typename base_type::atomic_segment* embedded_table, size_type start_index ) { + __TBB_ASSERT(start_index <= this->embedded_table_size, "Start index out of embedded table"); + + // If other threads are trying to set pointers in the short segment, wait for them to finish their + // assignments before we copy the short segment to the long segment. Note: grow_to_at_least depends on it + for (segment_index_type i = 0; this->segment_base(i) < start_index; ++i) { + spin_wait_while_eq(embedded_table[i], segment_type(nullptr)); + } + + // It is possible that the table was extend by a thread allocating first_block, need to check this. + if (this->get_table() != embedded_table) { + return nullptr; + } + + // Allocate long segment table and fill with null pointers + segment_table_type new_segment_table = segment_table_allocator_traits::allocate(base_type::get_allocator(), this->pointers_per_long_table); + // Copy segment pointers from the embedded table + for (size_type segment_index = 0; segment_index < this->pointers_per_embedded_table; ++segment_index) { + segment_table_allocator_traits::construct(base_type::get_allocator(), &new_segment_table[segment_index], + embedded_table[segment_index].load(std::memory_order_relaxed)); + } + for (size_type segment_index = this->pointers_per_embedded_table; segment_index < this->pointers_per_long_table; ++segment_index) { + segment_table_allocator_traits::construct(base_type::get_allocator(), &new_segment_table[segment_index], nullptr); + } + + return new_segment_table; + } + + // create_segment function is required by the segment_table base class + segment_type create_segment( segment_table_type table, segment_index_type seg_index, size_type index ) { + size_type first_block = this->my_first_block.load(std::memory_order_relaxed); + // First block allocation + if (seg_index < first_block) { + // If 0 segment is already allocated, then it remains to wait until the segments are filled to requested + if (table[0].load(std::memory_order_acquire) != nullptr) { + spin_wait_while_eq(table[seg_index], segment_type(nullptr)); + return nullptr; + } + + segment_element_allocator_type segment_allocator(base_type::get_allocator()); + segment_type new_segment = nullptr; + size_type first_block_size = this->segment_size(first_block); + try_call( [&] { + new_segment = segment_element_allocator_traits::allocate(segment_allocator, first_block_size); + } ).on_exception( [&] { + segment_type disabled_segment = nullptr; + if (table[0].compare_exchange_strong(disabled_segment, this->segment_allocation_failure_tag)) { + size_type end_segment = table == this->my_embedded_table ? 
this->pointers_per_embedded_table : first_block; + for (size_type i = 1; i < end_segment; ++i) { + table[i].store(this->segment_allocation_failure_tag, std::memory_order_release); + } + } + }); + + segment_type disabled_segment = nullptr; + if (table[0].compare_exchange_strong(disabled_segment, new_segment)) { + this->extend_table_if_necessary(table, 0, first_block_size); + for (size_type i = 1; i < first_block; ++i) { + table[i].store(new_segment, std::memory_order_release); + } + + // Other threads can wait on a snapshot of an embedded table, need to fill it. + for (size_type i = 1; i < first_block && i < this->pointers_per_embedded_table; ++i) { + this->my_embedded_table[i].store(new_segment, std::memory_order_release); + } + } else if (new_segment != this->segment_allocation_failure_tag) { + // Deallocate the memory + segment_element_allocator_traits::deallocate(segment_allocator, new_segment, first_block_size); + // 0 segment is already allocated, then it remains to wait until the segments are filled to requested + spin_wait_while_eq(table[seg_index], segment_type(nullptr)); + } + } else { + size_type offset = this->segment_base(seg_index); + if (index == offset) { + __TBB_ASSERT(table[seg_index].load(std::memory_order_relaxed) == nullptr, "Only this thread can enable this segment"); + segment_element_allocator_type segment_allocator(base_type::get_allocator()); + segment_type new_segment = this->segment_allocation_failure_tag; + try_call( [&] { + new_segment = segment_element_allocator_traits::allocate(segment_allocator,this->segment_size(seg_index)); + // Shift base address to simplify access by index + new_segment -= this->segment_base(seg_index); + } ).on_completion( [&] { + table[seg_index].store(new_segment, std::memory_order_release); + }); + } else { + spin_wait_while_eq(table[seg_index], segment_type(nullptr)); + } + } + return nullptr; + } + + // Returns the number of elements in the segment to be destroy + size_type number_of_elements_in_segment( segment_index_type seg_index ) { + size_type curr_vector_size = this->my_size.load(std::memory_order_relaxed); + size_type curr_segment_base = this->segment_base(seg_index); + + if (seg_index == 0) { + return std::min(curr_vector_size, this->segment_size(seg_index)); + } else { + // Perhaps the segment is allocated, but there are no elements in it. + if (curr_vector_size < curr_segment_base) { + return 0; + } + return curr_segment_base * 2 > curr_vector_size ? curr_vector_size - curr_segment_base : curr_segment_base; + } + } + + segment_type nullify_segment( segment_table_type table, size_type segment_index ) { + segment_type target_segment = table[segment_index].load(std::memory_order_relaxed); + if (segment_index >= this->my_first_block) { + table[segment_index].store(nullptr, std::memory_order_relaxed); + } else { + if (segment_index == 0) { + for (size_type i = 0; i < this->my_first_block; ++i) { + table[i].store(nullptr, std::memory_order_relaxed); + } + } + } + + return target_segment; + } + + void deallocate_segment( segment_type address, segment_index_type seg_index ) { + segment_element_allocator_type segment_allocator(base_type::get_allocator()); + size_type first_block = this->my_first_block.load(std::memory_order_relaxed); + if (seg_index >= first_block) { + segment_element_allocator_traits::deallocate(segment_allocator, address, this->segment_size(seg_index)); + } + else if (seg_index == 0) { + size_type elements_to_deallocate = first_block > 0 ? 
this->segment_size(first_block) : this->segment_size(0); + segment_element_allocator_traits::deallocate(segment_allocator, address, elements_to_deallocate); + } + } + + // destroy_segment function is required by the segment_table base class + void destroy_segment( segment_type address, segment_index_type seg_index ) { + size_type elements_to_destroy = number_of_elements_in_segment(seg_index); + segment_element_allocator_type segment_allocator(base_type::get_allocator()); + + for (size_type i = 0; i < elements_to_destroy; ++i) { + segment_element_allocator_traits::destroy(segment_allocator, address + i); + } + + deallocate_segment(address, seg_index); + } + + // copy_segment function is required by the segment_table base class + void copy_segment( segment_index_type seg_index, segment_type from, segment_type to ) { + size_type i = 0; + try_call( [&] { + for (; i != number_of_elements_in_segment(seg_index); ++i) { + segment_table_allocator_traits::construct(base_type::get_allocator(), to + i, from[i]); + } + } ).on_exception( [&] { + // Zero-initialize items left not constructed after the exception + zero_unconstructed_elements(this->get_segment(seg_index) + i, this->segment_size(seg_index) - i); + + segment_index_type last_segment = this->segment_index_of(this->my_size.load(std::memory_order_relaxed)); + auto table = this->get_table(); + for (segment_index_type j = seg_index + 1; j != last_segment; ++j) { + auto curr_segment = table[j].load(std::memory_order_relaxed); + if (curr_segment) { + zero_unconstructed_elements(curr_segment + this->segment_base(j), this->segment_size(j)); + } + } + this->my_size.store(this->segment_size(seg_index) + i, std::memory_order_relaxed); + }); + } + + // move_segment function is required by the segment_table base class + void move_segment( segment_index_type seg_index, segment_type from, segment_type to ) { + size_type i = 0; + try_call( [&] { + for (; i != number_of_elements_in_segment(seg_index); ++i) { + segment_table_allocator_traits::construct(base_type::get_allocator(), to + i, std::move(from[i])); + } + } ).on_exception( [&] { + // Zero-initialize items left not constructed after the exception + zero_unconstructed_elements(this->get_segment(seg_index) + i, this->segment_size(seg_index) - i); + + segment_index_type last_segment = this->segment_index_of(this->my_size.load(std::memory_order_relaxed)); + auto table = this->get_table(); + for (segment_index_type j = seg_index + 1; j != last_segment; ++j) { + auto curr_segment = table[j].load(std::memory_order_relaxed); + if (curr_segment) { + zero_unconstructed_elements(curr_segment + this->segment_base(j), this->segment_size(j)); + } + } + this->my_size.store(this->segment_size(seg_index) + i, std::memory_order_relaxed); + }); + } + + static constexpr bool is_first_element_in_segment( size_type index ) { + // An element is the first in a segment if its index is equal to a power of two + return is_power_of_two_at_least(index, 2); + } + + const_reference internal_subscript( size_type index ) const { + return const_cast(this)->internal_subscript(index); + } + + reference internal_subscript( size_type index ) { + __TBB_ASSERT(index < this->my_size.load(std::memory_order_relaxed), "Invalid subscript index"); + return base_type::template internal_subscript(index); + } + + const_reference internal_subscript_with_exceptions( size_type index ) const { + return const_cast(this)->internal_subscript_with_exceptions(index); + } + + reference internal_subscript_with_exceptions( size_type index ) { + if (index >= 
this->my_size.load(std::memory_order_acquire)) { + tbb::detail::throw_exception(exception_id::out_of_range); + } + + segment_table_type table = this->my_segment_table.load(std::memory_order_acquire); + + size_type seg_index = this->segment_index_of(index); + if (base_type::number_of_segments(table) < seg_index) { + tbb::detail::throw_exception(exception_id::out_of_range); + } + + if (table[seg_index] <= this->segment_allocation_failure_tag) { + tbb::detail::throw_exception(exception_id::out_of_range); + } + + return base_type::template internal_subscript(index); + } + + static void zero_unconstructed_elements( pointer start, size_type count ) { + std::memset(static_cast(start), 0, count * sizeof(value_type)); + } + + template + iterator internal_emplace_back( Args&&... args ) { + size_type old_size = this->my_size++; + this->assign_first_block_if_necessary(default_first_block_size); + auto element_address = &base_type::template internal_subscript(old_size); + + // try_call API is not convenient here due to broken + // variadic capture on GCC 4.8.5 + auto value_guard = make_raii_guard([&] { + zero_unconstructed_elements(element_address, /*count =*/1); + }); + + segment_table_allocator_traits::construct(base_type::get_allocator(), element_address, std::forward(args)...); + value_guard.dismiss(); + return iterator(*this, old_size, element_address); + } + + template + void internal_loop_construct( segment_table_type table, size_type start_idx, size_type end_idx, const Args&... args ) { + static_assert(sizeof...(Args) < 2, "Too many parameters"); + for (size_type idx = start_idx; idx < end_idx; ++idx) { + auto element_address = &base_type::template internal_subscript(idx); + // try_call API is not convenient here due to broken + // variadic capture on GCC 4.8.5 + auto value_guard = make_raii_guard( [&] { + segment_index_type last_allocated_segment = this->find_last_allocated_segment(table); + size_type segment_size = this->segment_size(last_allocated_segment); + end_idx = end_idx < segment_size ? end_idx : segment_size; + for (size_type i = idx; i < end_idx; ++i) { + zero_unconstructed_elements(&this->internal_subscript(i), /*count =*/1); + } + }); + segment_table_allocator_traits::construct(base_type::get_allocator(), element_address, args...); + value_guard.dismiss(); + } + } + + template + void internal_loop_construct( segment_table_type table, size_type start_idx, size_type end_idx, ForwardIterator first, ForwardIterator ) { + for (size_type idx = start_idx; idx < end_idx; ++idx) { + auto element_address = &base_type::template internal_subscript(idx); + try_call( [&] { + segment_table_allocator_traits::construct(base_type::get_allocator(), element_address, *first++); + } ).on_exception( [&] { + segment_index_type last_allocated_segment = this->find_last_allocated_segment(table); + size_type segment_size = this->segment_size(last_allocated_segment); + end_idx = end_idx < segment_size ? end_idx : segment_size; + for (size_type i = idx; i < end_idx; ++i) { + zero_unconstructed_elements(&this->internal_subscript(i), /*count =*/1); + } + }); + } + } + + template + iterator internal_grow( size_type start_idx, size_type end_idx, const Args&... 
args ) { + this->assign_first_block_if_necessary(this->segment_index_of(end_idx - 1) + 1); + size_type seg_index = this->segment_index_of(end_idx - 1); + segment_table_type table = this->get_table(); + this->extend_table_if_necessary(table, start_idx, end_idx); + + if (seg_index > this->my_first_block.load(std::memory_order_relaxed)) { + // So that other threads be able to work with the last segment of grow_by, allocate it immediately. + // If the last segment is not less than the first block + if (table[seg_index].load(std::memory_order_relaxed) == nullptr) { + size_type first_element = this->segment_base(seg_index); + if (first_element >= start_idx && first_element < end_idx) { + segment_type segment = table[seg_index].load(std::memory_order_relaxed); + base_type::enable_segment(segment, table, seg_index, first_element); + } + } + } + + internal_loop_construct(table, start_idx, end_idx, args...); + + return iterator(*this, start_idx, &base_type::template internal_subscript(start_idx)); + } + + + template + iterator internal_grow_by_delta( size_type delta, const Args&... args ) { + if (delta == size_type(0)) { + return end(); + } + size_type start_idx = this->my_size.fetch_add(delta); + size_type end_idx = start_idx + delta; + + return internal_grow(start_idx, end_idx, args...); + } + + template + iterator internal_grow_to_at_least( size_type new_size, const Args&... args ) { + size_type old_size = this->my_size.load(std::memory_order_relaxed); + if (new_size == size_type(0)) return iterator(*this, 0); + while (old_size < new_size && !this->my_size.compare_exchange_weak(old_size, new_size)) + {} + + int delta = static_cast(new_size) - static_cast(old_size); + if (delta > 0) { + return internal_grow(old_size, new_size, args...); + } + + size_type end_segment = this->segment_index_of(new_size - 1); + + // Check/wait for segments allocation completes + if (end_segment >= this->pointers_per_embedded_table && + this->get_table() == this->my_embedded_table) + { + spin_wait_while_eq(this->my_segment_table, this->my_embedded_table); + } + + for (segment_index_type seg_idx = 0; seg_idx <= end_segment; ++seg_idx) { + if (this->get_table()[seg_idx].load(std::memory_order_relaxed) == nullptr) { + atomic_backoff backoff(true); + while (this->get_table()[seg_idx].load(std::memory_order_relaxed) == nullptr) { + backoff.pause(); + } + } + } + + #if TBB_USE_DEBUG + size_type cap = capacity(); + __TBB_ASSERT( cap >= new_size, nullptr); + #endif + return iterator(*this, size()); + } + + template + void internal_resize( size_type n, const Args&... 
args ) { + if (n == 0) { + clear(); + return; + } + + size_type old_size = this->my_size.load(std::memory_order_acquire); + if (n > old_size) { + reserve(n); + grow_to_at_least(n, args...); + } else { + if (old_size == n) { + return; + } + size_type last_segment = this->segment_index_of(old_size - 1); + // Delete segments + for (size_type seg_idx = this->segment_index_of(n - 1) + 1; seg_idx <= last_segment; ++seg_idx) { + this->delete_segment(seg_idx); + } + + // If n > segment_size(n) => we need to destroy all of the items in the first segment + // Otherwise, we need to destroy only items with the index < n + size_type n_segment = this->segment_index_of(n - 1); + size_type last_index_to_destroy = std::min(this->segment_base(n_segment) + this->segment_size(n_segment), old_size); + // Destroy elements in curr segment + for (size_type idx = n; idx < last_index_to_destroy; ++idx) { + segment_table_allocator_traits::destroy(base_type::get_allocator(), &base_type::template internal_subscript(idx)); + } + this->my_size.store(n, std::memory_order_release); + } + } + + void destroy_elements() { + allocator_type alloc(base_type::get_allocator()); + for (size_type i = 0; i < this->my_size.load(std::memory_order_relaxed); ++i) { + allocator_traits_type::destroy(alloc, &base_type::template internal_subscript(i)); + } + this->my_size.store(0, std::memory_order_relaxed); + } + + static bool incompact_predicate( size_type size ) { + // memory page size + const size_type page_size = 4096; + return size < page_size || ((size - 1) % page_size < page_size / 2 && size < page_size * 128); + } + + void internal_compact() { + const size_type curr_size = this->my_size.load(std::memory_order_relaxed); + segment_table_type table = this->get_table(); + const segment_index_type k_end = this->find_last_allocated_segment(table); // allocated segments + const segment_index_type k_stop = curr_size ? this->segment_index_of(curr_size - 1) + 1 : 0; // number of segments to store existing items: 0=>0; 1,2=>1; 3,4=>2; [5-8]=>3;.. 
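+        // [editorial note, not upstream TBB] Sketch of the power-of-two segment
+        // layout that the k_stop comment above relies on; the authoritative
+        // formulas are segment_index_of()/segment_base()/segment_size() in
+        // detail/_segment_table.h:
+        //
+        //   element index : 0 1 | 2 3 | 4 5 6 7 | 8 ... 15 | ...
+        //   segment index :  0  |  1  |    2    |     3    | ...
+        //
+        // i.e. segment 0 holds two elements and each later segment k starts at
+        // base 2^k and holds 2^k elements, so curr_size elements occupy
+        // segment_index_of(curr_size - 1) + 1 segments (1,2=>1; 3,4=>2; 5..8=>3).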
+ const segment_index_type first_block = this->my_first_block; // number of merged segments, getting values from atomics + + segment_index_type k = first_block; + if (k_stop < first_block) { + k = k_stop; + } + else { + while (k < k_stop && incompact_predicate(this->segment_size(k) * sizeof(value_type))) k++; + } + + if (k_stop == k_end && k == first_block) { + return; + } + + // First segment optimization + if (k != first_block && k) { + size_type max_block = std::max(first_block, k); + + auto buffer_table = segment_table_allocator_traits::allocate(base_type::get_allocator(), max_block); + + for (size_type seg_idx = 0; seg_idx < max_block; ++seg_idx) { + segment_table_allocator_traits::construct(base_type::get_allocator(), &buffer_table[seg_idx], + table[seg_idx].load(std::memory_order_relaxed)); + table[seg_idx].store(nullptr, std::memory_order_relaxed); + } + + this->my_first_block.store(k, std::memory_order_relaxed); + size_type index = 0; + try_call( [&] { + for (; index < std::min(this->segment_size(max_block), curr_size); ++index) { + auto element_address = &static_cast(this)->operator[](index); + segment_index_type seg_idx = this->segment_index_of(index); + segment_table_allocator_traits::construct(base_type::get_allocator(), element_address, + std::move_if_noexcept(buffer_table[seg_idx].load(std::memory_order_relaxed)[index])); + } + } ).on_exception( [&] { + segment_element_allocator_type allocator(base_type::get_allocator()); + for (size_type i = 0; i < index; ++i) { + auto element_adress = &this->operator[](i); + segment_element_allocator_traits::destroy(allocator, element_adress); + } + segment_element_allocator_traits::deallocate(allocator, + table[0].load(std::memory_order_relaxed), this->segment_size(max_block)); + + for (size_type seg_idx = 0; seg_idx < max_block; ++seg_idx) { + table[seg_idx].store(buffer_table[seg_idx].load(std::memory_order_relaxed), + std::memory_order_relaxed); + buffer_table[seg_idx].store(nullptr, std::memory_order_relaxed); + } + segment_table_allocator_traits::deallocate(base_type::get_allocator(), + buffer_table, max_block); + this->my_first_block.store(first_block, std::memory_order_relaxed); + }); + + // Need to correct deallocate old segments + // Method destroy_segment respect active first_block, therefore, + // in order for the segment deletion to work correctly, set the first_block size that was earlier, + // destroy the unnecessary segments. 
+ this->my_first_block.store(first_block, std::memory_order_relaxed); + for (size_type seg_idx = max_block; seg_idx > 0 ; --seg_idx) { + auto curr_segment = buffer_table[seg_idx - 1].load(std::memory_order_relaxed); + if (curr_segment != nullptr) { + destroy_segment(buffer_table[seg_idx - 1].load(std::memory_order_relaxed) + this->segment_base(seg_idx - 1), + seg_idx - 1); + } + } + + this->my_first_block.store(k, std::memory_order_relaxed); + + for (size_type seg_idx = 0; seg_idx < max_block; ++seg_idx) { + segment_table_allocator_traits::destroy(base_type::get_allocator(), &buffer_table[seg_idx]); + } + + segment_table_allocator_traits::deallocate(base_type::get_allocator(), buffer_table, max_block); + } + // free unnecessary segments allocated by reserve() call + if (k_stop < k_end) { + for (size_type seg_idx = k_end; seg_idx != k_stop; --seg_idx) { + if (table[seg_idx - 1].load(std::memory_order_relaxed) != nullptr) { + this->delete_segment(seg_idx - 1); + } + } + if (!k) this->my_first_block.store(0, std::memory_order_relaxed); + } + } + + // Lever for adjusting the size of first_block at the very first insertion. + // TODO: consider >1 value, check performance + static constexpr size_type default_first_block_size = 1; + + template + friend class vector_iterator; +}; // class concurrent_vector + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT +// Deduction guide for the constructor from two iterators +template >, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_vector( It, It, Alloc = Alloc() ) +-> concurrent_vector, Alloc>; +#endif + +template +void swap(concurrent_vector &lhs, + concurrent_vector &rhs) +{ + lhs.swap(rhs); +} + +template +bool operator==(const concurrent_vector &lhs, + const concurrent_vector &rhs) +{ + return lhs.size() == rhs.size() && std::equal(lhs.begin(), lhs.end(), rhs.begin()); +} + +#if !__TBB_CPP20_COMPARISONS_PRESENT +template +bool operator!=(const concurrent_vector &lhs, + const concurrent_vector &rhs) +{ + return !(lhs == rhs); +} +#endif // !__TBB_CPP20_COMPARISONS_PRESENT + +#if __TBB_CPP20_COMPARISONS_PRESENT && __TBB_CPP20_CONCEPTS_PRESENT +template +tbb::detail::synthesized_three_way_result::value_type> +operator<=>(const concurrent_vector &lhs, + const concurrent_vector &rhs) +{ + return std::lexicographical_compare_three_way(lhs.begin(), lhs.end(), + rhs.begin(), rhs.end(), + tbb::detail::synthesized_three_way_comparator{}); +} + +#else + +template +bool operator<(const concurrent_vector &lhs, + const concurrent_vector &rhs) +{ + return std::lexicographical_compare(lhs.begin(), lhs.end(), rhs.begin(), rhs.end()); +} + +template +bool operator<=(const concurrent_vector &lhs, + const concurrent_vector &rhs) +{ + return !(rhs < lhs); +} + +template +bool operator>(const concurrent_vector &lhs, + const concurrent_vector &rhs) +{ + return rhs < lhs; +} + +template +bool operator>=(const concurrent_vector &lhs, + const concurrent_vector &rhs) +{ + return !(lhs < rhs); +} +#endif // __TBB_CPP20_COMPARISONS_PRESENT && __TBB_CPP20_CONCEPTS_PRESENT + +} // namespace d1 +} // namespace detail + +inline namespace v1 { + using detail::d1::concurrent_vector; +} // namespace v1 + +} // namespace tbb + +#endif // __TBB_concurrent_vector_H diff --git a/third_party/tbb/detail/_aggregator.h b/third_party/tbb/detail/_aggregator.h new file mode 100644 index 000000000..bc263885c --- /dev/null +++ b/third_party/tbb/detail/_aggregator.h @@ -0,0 +1,177 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the 
Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + + +#ifndef __TBB_detail__aggregator_H +#define __TBB_detail__aggregator_H + +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/libcxx/atomic" +#if !__TBBMALLOC_BUILD // TODO: check this macro with TBB Malloc +#include "third_party/tbb/profiling.h" +#endif + +namespace tbb { +namespace detail { +namespace d1 { + +// Base class for aggregated operation +template +class aggregated_operation { +public: + // Zero value means "wait" status, all other values are "user" specified values and + // are defined into the scope of a class which uses "status" + std::atomic status; + + std::atomic next; + aggregated_operation() : status{}, next(nullptr) {} +}; // class aggregated_operation + +// Aggregator base class +/* An aggregator for collecting operations coming from multiple sources and executing + them serially on a single thread. OperationType must be derived from + aggregated_operation. The parameter HandlerType is a functor that will be passed the + list of operations and is expected to handle each operation appropriately, setting the + status of each operation to non-zero. */ +template +class aggregator_generic { +public: + aggregator_generic() : pending_operations(nullptr), handler_busy(false) {} + + // Execute an operation + /* Places an operation into the waitlist (pending_operations), and either handles the list, + or waits for the operation to complete, or returns. + The long_life_time parameter specifies the life time of the given operation object. + Operations with long_life_time == true may be accessed after execution. + A "short" life time operation (long_life_time == false) can be destroyed + during execution, and so any access to it after it was put into the waitlist, + including status check, is invalid. As a consequence, waiting for completion + of such operation causes undefined behavior. */ + template + void execute( OperationType* op, HandlerType& handle_operations, bool long_life_time = true ) { + // op->status should be read before inserting the operation into the + // aggregator waitlist since it can become invalid after executing a + // handler (if the operation has 'short' life time.) + const uintptr_t status = op->status.load(std::memory_order_relaxed); + + // ITT note: &(op->status) tag is used to cover accesses to this op node. This + // thread has created the operation, and now releases it so that the handler + // thread may handle the associated operation w/o triggering a race condition; + // thus this tag will be acquired just before the operation is handled in the + // handle_operations functor. + call_itt_notify(releasing, &(op->status)); + // insert the operation in the queue. 
+ OperationType* res = pending_operations.load(std::memory_order_relaxed); + do { + op->next.store(res, std::memory_order_relaxed); + } while (!pending_operations.compare_exchange_strong(res, op)); + if (!res) { // first in the list; handle the operations + // ITT note: &pending_operations tag covers access to the handler_busy flag, + // which this waiting handler thread will try to set before entering + // handle_operations. + call_itt_notify(acquired, &pending_operations); + start_handle_operations(handle_operations); + // The operation with 'short' life time can already be destroyed + if (long_life_time) + __TBB_ASSERT(op->status.load(std::memory_order_relaxed), nullptr); + } + // Not first; wait for op to be ready + else if (!status) { // operation is blocking here. + __TBB_ASSERT(long_life_time, "Waiting for an operation object that might be destroyed during processing"); + call_itt_notify(prepare, &(op->status)); + spin_wait_while_eq(op->status, uintptr_t(0)); + } + } + +private: + // Trigger the handling of operations when the handler is free + template + void start_handle_operations( HandlerType& handle_operations ) { + OperationType* op_list; + + // ITT note: &handler_busy tag covers access to pending_operations as it is passed + // between active and waiting handlers. Below, the waiting handler waits until + // the active handler releases, and the waiting handler acquires &handler_busy as + // it becomes the active_handler. The release point is at the end of this + // function, when all operations in pending_operations have been handled by the + // owner of this aggregator. + call_itt_notify(prepare, &handler_busy); + // get the handler_busy: + // only one thread can possibly spin here at a time + spin_wait_until_eq(handler_busy, uintptr_t(0)); + call_itt_notify(acquired, &handler_busy); + // acquire fence not necessary here due to causality rule and surrounding atomics + handler_busy.store(1, std::memory_order_relaxed); + + // ITT note: &pending_operations tag covers access to the handler_busy flag + // itself. Capturing the state of the pending_operations signifies that + // handler_busy has been set and a new active handler will now process that list's + // operations. 
+ call_itt_notify(releasing, &pending_operations); + // grab pending_operations + op_list = pending_operations.exchange(nullptr); + + // handle all the operations + handle_operations(op_list); + + // release the handler + handler_busy.store(0, std::memory_order_release); + } + + // An atomically updated list (aka mailbox) of pending operations + std::atomic pending_operations; + // Controls threads access to handle_operations + std::atomic handler_busy; +}; // class aggregator_generic + +template +class aggregator : public aggregator_generic { + HandlerType handle_operations; +public: + aggregator() = default; + + void initialize_handler( HandlerType h ) { handle_operations = h; } + + void execute(OperationType* op) { + aggregator_generic::execute(op, handle_operations); + } +}; // class aggregator + +// the most-compatible friend declaration (vs, gcc, icc) is +// template friend class aggregating_functor; +template +class aggregating_functor { + AggregatingClass* my_object{nullptr}; +public: + aggregating_functor() = default; + aggregating_functor( AggregatingClass* object ) : my_object(object) { + __TBB_ASSERT(my_object, nullptr); + } + + void operator()( OperationList* op_list ) { + __TBB_ASSERT(my_object, nullptr); + my_object->handle_operations(op_list); + } +}; // class aggregating_functor + + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail__aggregator_H diff --git a/third_party/tbb/detail/_aligned_space.h b/third_party/tbb/detail/_aligned_space.h new file mode 100644 index 000000000..9a5addba4 --- /dev/null +++ b/third_party/tbb/detail/_aligned_space.h @@ -0,0 +1,47 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef __TBB_aligned_space_H +#define __TBB_aligned_space_H + +#include "third_party/libcxx/cstddef" + +#include "third_party/tbb/detail/_template_helpers.h" + +namespace tbb { +namespace detail { +inline namespace d0 { + +//! Block of space aligned sufficiently to construct an array T with N elements. +/** The elements are not constructed or destroyed by this class. + @ingroup memory_allocation */ +template +class aligned_space { + alignas(alignof(T)) std::uint8_t aligned_array[N * sizeof(T)]; + +public: + //! Pointer to beginning of array + T* begin() const { return punned_cast(&aligned_array); } + + //! Pointer to one past last element in array. + T* end() const { return begin() + N; } +}; + +} // namespace d0 +} // namespace detail +} // namespace tbb + +#endif /* __TBB_aligned_space_H */ diff --git a/third_party/tbb/detail/_allocator_traits.h b/third_party/tbb/detail/_allocator_traits.h new file mode 100644 index 000000000..366cc63d1 --- /dev/null +++ b/third_party/tbb/detail/_allocator_traits.h @@ -0,0 +1,108 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__allocator_traits_H +#define __TBB_detail__allocator_traits_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/libcxx/memory" +#include "third_party/libcxx/type_traits" + +namespace tbb { +namespace detail { +inline namespace d0 { + +#if !__TBB_CPP17_ALLOCATOR_IS_ALWAYS_EQUAL_PRESENT +// Struct is_always_equal_detector provides the member type "type" which is +// Allocator::is_always_equal if it is present, std::false_type otherwise +template +struct is_always_equal_detector { + using type = std::false_type; +}; + +template +struct is_always_equal_detector> +{ + using type = typename Allocator::is_always_equal; +}; +#endif // !__TBB_CPP17_ALLOCATOR_IS_ALWAYS_EQUAL_PRESENT + +template +class allocator_traits : public std::allocator_traits +{ + using base_type = std::allocator_traits; +public: +#if !__TBB_CPP17_ALLOCATOR_IS_ALWAYS_EQUAL_PRESENT + using is_always_equal = typename is_always_equal_detector::type; +#endif + + template + using rebind_traits = typename tbb::detail::allocator_traits>; +}; // struct allocator_traits + +template +void copy_assign_allocators_impl( Allocator& lhs, const Allocator& rhs, /*pocca = */std::true_type ) { + lhs = rhs; +} + +template +void copy_assign_allocators_impl( Allocator&, const Allocator&, /*pocca = */ std::false_type ) {} + +// Copy assigns allocators only if propagate_on_container_copy_assignment is true +template +void copy_assign_allocators( Allocator& lhs, const Allocator& rhs ) { + using pocca_type = typename allocator_traits::propagate_on_container_copy_assignment; + copy_assign_allocators_impl(lhs, rhs, pocca_type()); +} + +template +void move_assign_allocators_impl( Allocator& lhs, Allocator& rhs, /*pocma = */ std::true_type ) { + lhs = std::move(rhs); +} + +template +void move_assign_allocators_impl( Allocator&, Allocator&, /*pocma = */ std::false_type ) {} + +// Move assigns allocators only if propagate_on_container_move_assignment is true +template +void move_assign_allocators( Allocator& lhs, Allocator& rhs ) { + using pocma_type = typename allocator_traits::propagate_on_container_move_assignment; + move_assign_allocators_impl(lhs, rhs, pocma_type()); +} + +template +void swap_allocators_impl( Allocator& lhs, Allocator& rhs, /*pocs = */ std::true_type ) { + using std::swap; + swap(lhs, rhs); +} + +template +void swap_allocators_impl( Allocator&, Allocator&, /*pocs = */ std::false_type ) {} + +// Swaps allocators only if propagate_on_container_swap is true +template +void swap_allocators( Allocator& lhs, Allocator& rhs ) { + using pocs_type = typename allocator_traits::propagate_on_container_swap; + swap_allocators_impl(lhs, rhs, pocs_type()); +} + +} // inline namespace d0 +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail__allocator_traits_H diff --git a/third_party/tbb/detail/_assert.h b/third_party/tbb/detail/_assert.h new file mode 100644 index 000000000..0d1210860 --- /dev/null +++ b/third_party/tbb/detail/_assert.h @@ -0,0 +1,65 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed 
under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__assert_H +#define __TBB_detail__assert_H + +#include "third_party/tbb/detail/_config.h" + +#if __TBBMALLOC_BUILD +namespace rml { namespace internal { +#else +namespace tbb { +namespace detail { +namespace r1 { +#endif +//! Process an assertion failure. +/** Normally called from __TBB_ASSERT macro. + If assertion handler is null, print message for assertion failure and abort. + Otherwise call the assertion handler. */ +TBB_EXPORT void __TBB_EXPORTED_FUNC assertion_failure(const char* location, int line, const char* expression, const char* comment); +#if __TBBMALLOC_BUILD +}} // namespaces rml::internal +#else +} // namespace r1 +} // namespace detail +} // namespace tbb +#endif + +#if __TBBMALLOC_BUILD +//! Release version of assertions +#define __TBB_ASSERT_RELEASE(predicate,message) ((predicate)?((void)0) : rml::internal::assertion_failure(__func__,__LINE__,#predicate,message)) +#else +#define __TBB_ASSERT_RELEASE(predicate,message) ((predicate)?((void)0) : tbb::detail::r1::assertion_failure(__func__,__LINE__,#predicate,message)) +#endif + +#if TBB_USE_ASSERT + //! Assert that predicate is true. + /** If predicate is false, print assertion failure message. + If the comment argument is not nullptr, it is printed as part of the failure message. + The comment argument has no other effect. */ + #define __TBB_ASSERT(predicate,message) __TBB_ASSERT_RELEASE(predicate,message) + //! "Extended" version + #define __TBB_ASSERT_EX __TBB_ASSERT +#else + //! No-op version of __TBB_ASSERT. + #define __TBB_ASSERT(predicate,comment) ((void)0) + //! "Extended" version is useful to suppress warnings if a variable is only used with an assert + #define __TBB_ASSERT_EX(predicate,comment) ((void)(1 && (predicate))) +#endif // TBB_USE_ASSERT + +#endif // __TBB_detail__assert_H diff --git a/third_party/tbb/detail/_attach.h b/third_party/tbb/detail/_attach.h new file mode 100644 index 000000000..ddf21d590 --- /dev/null +++ b/third_party/tbb/detail/_attach.h @@ -0,0 +1,33 @@ +// clang-format off +/* + Copyright (c) 2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_detail__attach_H +#define __TBB_detail__attach_H + +#include "third_party/tbb/detail/_config.h" + +namespace tbb { +namespace detail { +namespace d1 { + + struct attach {}; + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail__attach_H diff --git a/third_party/tbb/detail/_concurrent_queue_base.h b/third_party/tbb/detail/_concurrent_queue_base.h new file mode 100644 index 000000000..2cec3e168 --- /dev/null +++ b/third_party/tbb/detail/_concurrent_queue_base.h @@ -0,0 +1,651 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__concurrent_queue_base_H +#define __TBB_detail__concurrent_queue_base_H + +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_machine.h" +#include "third_party/tbb/detail/_allocator_traits.h" + +#include "third_party/tbb/profiling.h" +#include "third_party/tbb/spin_mutex.h" +#include "third_party/tbb/cache_aligned_allocator.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace d2 { + +using ticket_type = std::size_t; + +template +inline bool is_valid_page(const Page p) { + return reinterpret_cast(p) > 1; +} + +template +struct concurrent_queue_rep; + +template +class micro_queue_pop_finalizer; + +#if _MSC_VER && !defined(__INTEL_COMPILER) +// unary minus operator applied to unsigned type, result still unsigned +#pragma warning( push ) +#pragma warning( disable: 4146 ) +#endif + +// A queue using simple locking. +// For efficiency, this class has no constructor. +// The caller is expected to zero-initialize it. +template +class micro_queue { +private: + using queue_rep_type = concurrent_queue_rep; + using self_type = micro_queue; +public: + using size_type = std::size_t; + using value_type = T; + using reference = value_type&; + using const_reference = const value_type&; + + using allocator_type = Allocator; + using allocator_traits_type = tbb::detail::allocator_traits; + using queue_allocator_type = typename allocator_traits_type::template rebind_alloc; + + static constexpr size_type item_size = sizeof(T); + static constexpr size_type items_per_page = item_size <= 8 ? 32 : + item_size <= 16 ? 16 : + item_size <= 32 ? 8 : + item_size <= 64 ? 4 : + item_size <= 128 ? 
2 : 1; + + struct padded_page { + padded_page() {} + ~padded_page() {} + + reference operator[] (std::size_t index) { + __TBB_ASSERT(index < items_per_page, "Index out of range"); + return items[index]; + } + + const_reference operator[] (std::size_t index) const { + __TBB_ASSERT(index < items_per_page, "Index out of range"); + return items[index]; + } + + padded_page* next{ nullptr }; + std::atomic mask{}; + + union { + value_type items[items_per_page]; + }; + }; // struct padded_page + + using page_allocator_type = typename allocator_traits_type::template rebind_alloc; +protected: + using page_allocator_traits = tbb::detail::allocator_traits; + +public: + using item_constructor_type = void (*)(value_type* location, const void* src); + micro_queue() = default; + micro_queue( const micro_queue& ) = delete; + micro_queue& operator=( const micro_queue& ) = delete; + + size_type prepare_page( ticket_type k, queue_rep_type& base, page_allocator_type page_allocator, + padded_page*& p ) { + __TBB_ASSERT(p == nullptr, "Invalid page argument for prepare_page"); + k &= -queue_rep_type::n_queue; + size_type index = modulo_power_of_two(k / queue_rep_type::n_queue, items_per_page); + if (!index) { + try_call( [&] { + p = page_allocator_traits::allocate(page_allocator, 1); + }).on_exception( [&] { + ++base.n_invalid_entries; + invalidate_page( k ); + }); + page_allocator_traits::construct(page_allocator, p); + } + + spin_wait_until_my_turn(tail_counter, k, base); + d1::call_itt_notify(d1::acquired, &tail_counter); + + if (p) { + spin_mutex::scoped_lock lock( page_mutex ); + padded_page* q = tail_page.load(std::memory_order_relaxed); + if (is_valid_page(q)) { + q->next = p; + } else { + head_page.store(p, std::memory_order_relaxed); + } + tail_page.store(p, std::memory_order_relaxed); + } else { + p = tail_page.load(std::memory_order_relaxed); + } + return index; + } + + template + void push( ticket_type k, queue_rep_type& base, queue_allocator_type& allocator, Args&&... args ) + { + padded_page* p = nullptr; + page_allocator_type page_allocator(allocator); + size_type index = prepare_page(k, base, page_allocator, p); + __TBB_ASSERT(p != nullptr, "Page was not prepared"); + + // try_call API is not convenient here due to broken + // variadic capture on GCC 4.8.5 + auto value_guard = make_raii_guard([&] { + ++base.n_invalid_entries; + d1::call_itt_notify(d1::releasing, &tail_counter); + tail_counter.fetch_add(queue_rep_type::n_queue); + }); + + page_allocator_traits::construct(page_allocator, &(*p)[index], std::forward(args)...); + // If no exception was thrown, mark item as present. 
+ p->mask.store(p->mask.load(std::memory_order_relaxed) | uintptr_t(1) << index, std::memory_order_relaxed); + d1::call_itt_notify(d1::releasing, &tail_counter); + + value_guard.dismiss(); + tail_counter.fetch_add(queue_rep_type::n_queue); + } + + void abort_push( ticket_type k, queue_rep_type& base, queue_allocator_type& allocator ) { + padded_page* p = nullptr; + prepare_page(k, base, allocator, p); + ++base.n_invalid_entries; + tail_counter.fetch_add(queue_rep_type::n_queue); + } + + bool pop( void* dst, ticket_type k, queue_rep_type& base, queue_allocator_type& allocator ) { + k &= -queue_rep_type::n_queue; + spin_wait_until_eq(head_counter, k); + d1::call_itt_notify(d1::acquired, &head_counter); + spin_wait_while_eq(tail_counter, k); + d1::call_itt_notify(d1::acquired, &tail_counter); + padded_page *p = head_page.load(std::memory_order_relaxed); + __TBB_ASSERT( p, nullptr ); + size_type index = modulo_power_of_two( k/queue_rep_type::n_queue, items_per_page ); + bool success = false; + { + page_allocator_type page_allocator(allocator); + micro_queue_pop_finalizer finalizer(*this, page_allocator, + k + queue_rep_type::n_queue, index == items_per_page - 1 ? p : nullptr ); + if (p->mask.load(std::memory_order_relaxed) & (std::uintptr_t(1) << index)) { + success = true; + assign_and_destroy_item(dst, *p, index); + } else { + --base.n_invalid_entries; + } + } + return success; + } + + micro_queue& assign( const micro_queue& src, queue_allocator_type& allocator, + item_constructor_type construct_item ) + { + head_counter.store(src.head_counter.load(std::memory_order_relaxed), std::memory_order_relaxed); + tail_counter.store(src.tail_counter.load(std::memory_order_relaxed), std::memory_order_relaxed); + + const padded_page* srcp = src.head_page.load(std::memory_order_relaxed); + if( is_valid_page(srcp) ) { + ticket_type g_index = head_counter.load(std::memory_order_relaxed); + size_type n_items = (tail_counter.load(std::memory_order_relaxed) - head_counter.load(std::memory_order_relaxed)) + / queue_rep_type::n_queue; + size_type index = modulo_power_of_two(head_counter.load(std::memory_order_relaxed) / queue_rep_type::n_queue, items_per_page); + size_type end_in_first_page = (index+n_items < items_per_page) ? 
(index + n_items) : items_per_page; + + try_call( [&] { + head_page.store(make_copy(allocator, srcp, index, end_in_first_page, g_index, construct_item), std::memory_order_relaxed); + }).on_exception( [&] { + head_counter.store(0, std::memory_order_relaxed); + tail_counter.store(0, std::memory_order_relaxed); + }); + padded_page* cur_page = head_page.load(std::memory_order_relaxed); + + try_call( [&] { + if (srcp != src.tail_page.load(std::memory_order_relaxed)) { + for (srcp = srcp->next; srcp != src.tail_page.load(std::memory_order_relaxed); srcp=srcp->next ) { + cur_page->next = make_copy( allocator, srcp, 0, items_per_page, g_index, construct_item ); + cur_page = cur_page->next; + } + + __TBB_ASSERT(srcp == src.tail_page.load(std::memory_order_relaxed), nullptr ); + size_type last_index = modulo_power_of_two(tail_counter.load(std::memory_order_relaxed) / queue_rep_type::n_queue, items_per_page); + if( last_index==0 ) last_index = items_per_page; + + cur_page->next = make_copy( allocator, srcp, 0, last_index, g_index, construct_item ); + cur_page = cur_page->next; + } + tail_page.store(cur_page, std::memory_order_relaxed); + }).on_exception( [&] { + padded_page* invalid_page = reinterpret_cast(std::uintptr_t(1)); + tail_page.store(invalid_page, std::memory_order_relaxed); + }); + } else { + head_page.store(nullptr, std::memory_order_relaxed); + tail_page.store(nullptr, std::memory_order_relaxed); + } + return *this; + } + + padded_page* make_copy( queue_allocator_type& allocator, const padded_page* src_page, size_type begin_in_page, + size_type end_in_page, ticket_type& g_index, item_constructor_type construct_item ) + { + page_allocator_type page_allocator(allocator); + padded_page* new_page = page_allocator_traits::allocate(page_allocator, 1); + new_page->next = nullptr; + new_page->mask.store(src_page->mask.load(std::memory_order_relaxed), std::memory_order_relaxed); + for (; begin_in_page!=end_in_page; ++begin_in_page, ++g_index) { + if (new_page->mask.load(std::memory_order_relaxed) & uintptr_t(1) << begin_in_page) { + copy_item(*new_page, begin_in_page, *src_page, begin_in_page, construct_item); + } + } + return new_page; + } + + void invalidate_page( ticket_type k ) { + // Append an invalid page at address 1 so that no more pushes are allowed. 
+ padded_page* invalid_page = reinterpret_cast(std::uintptr_t(1)); + { + spin_mutex::scoped_lock lock( page_mutex ); + tail_counter.store(k + queue_rep_type::n_queue + 1, std::memory_order_relaxed); + padded_page* q = tail_page.load(std::memory_order_relaxed); + if (is_valid_page(q)) { + q->next = invalid_page; + } else { + head_page.store(invalid_page, std::memory_order_relaxed); + } + tail_page.store(invalid_page, std::memory_order_relaxed); + } + } + + padded_page* get_head_page() { + return head_page.load(std::memory_order_relaxed); + } + + void clear(queue_allocator_type& allocator, padded_page* new_head = nullptr, padded_page* new_tail = nullptr) { + padded_page* curr_page = get_head_page(); + size_type index = (head_counter.load(std::memory_order_relaxed) / queue_rep_type::n_queue) % items_per_page; + page_allocator_type page_allocator(allocator); + + while (curr_page && is_valid_page(curr_page)) { + while (index != items_per_page) { + if (curr_page->mask.load(std::memory_order_relaxed) & (std::uintptr_t(1) << index)) { + page_allocator_traits::destroy(page_allocator, &curr_page->operator[](index)); + } + ++index; + } + + index = 0; + padded_page* next_page = curr_page->next; + page_allocator_traits::destroy(page_allocator, curr_page); + page_allocator_traits::deallocate(page_allocator, curr_page, 1); + curr_page = next_page; + } + head_counter.store(0, std::memory_order_relaxed); + tail_counter.store(0, std::memory_order_relaxed); + head_page.store(new_head, std::memory_order_relaxed); + tail_page.store(new_tail, std::memory_order_relaxed); + } + + void clear_and_invalidate(queue_allocator_type& allocator) { + padded_page* invalid_page = reinterpret_cast(std::uintptr_t(1)); + clear(allocator, invalid_page, invalid_page); + } + +private: + // template + friend class micro_queue_pop_finalizer; + + // Class used to ensure exception-safety of method "pop" + class destroyer { + value_type& my_value; + public: + destroyer( reference value ) : my_value(value) {} + destroyer( const destroyer& ) = delete; + destroyer& operator=( const destroyer& ) = delete; + ~destroyer() {my_value.~T();} + }; // class destroyer + + void copy_item( padded_page& dst, size_type dindex, const padded_page& src, size_type sindex, + item_constructor_type construct_item ) + { + auto& src_item = src[sindex]; + construct_item( &dst[dindex], static_cast(&src_item) ); + } + + void assign_and_destroy_item( void* dst, padded_page& src, size_type index ) { + auto& from = src[index]; + destroyer d(from); + *static_cast(dst) = std::move(from); + } + + void spin_wait_until_my_turn( std::atomic& counter, ticket_type k, queue_rep_type& rb ) const { + for (atomic_backoff b{};; b.pause()) { + ticket_type c = counter.load(std::memory_order_acquire); + if (c == k) return; + else if (c & 1) { + ++rb.n_invalid_entries; + throw_exception( exception_id::bad_last_alloc); + } + } + } + + std::atomic head_page{}; + std::atomic head_counter{}; + + std::atomic tail_page{}; + std::atomic tail_counter{}; + + spin_mutex page_mutex{}; +}; // class micro_queue + +#if _MSC_VER && !defined(__INTEL_COMPILER) +#pragma warning( pop ) +#endif // warning 4146 is back + +template +class micro_queue_pop_finalizer { +public: + using padded_page = typename Container::padded_page; + using allocator_type = Allocator; + using allocator_traits_type = tbb::detail::allocator_traits; + + micro_queue_pop_finalizer( Container& queue, Allocator& alloc, ticket_type k, padded_page* p ) : + my_ticket_type(k), my_queue(queue), my_page(p), allocator(alloc) + {} + + 
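+    // [editorial note, not upstream TBB] This RAII helper is what keeps
+    // micro_queue::pop() exception-safe: whatever happens while the popped value
+    // is moved out to the caller, the destructor below publishes my_ticket_type
+    // as the new head_counter (release store) and, when the consumed item was
+    // the last one on its page, unlinks, destroys and deallocates that page.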
micro_queue_pop_finalizer( const micro_queue_pop_finalizer& ) = delete; + micro_queue_pop_finalizer& operator=( const micro_queue_pop_finalizer& ) = delete; + + ~micro_queue_pop_finalizer() { + padded_page* p = my_page; + if( is_valid_page(p) ) { + spin_mutex::scoped_lock lock( my_queue.page_mutex ); + padded_page* q = p->next; + my_queue.head_page.store(q, std::memory_order_relaxed); + if( !is_valid_page(q) ) { + my_queue.tail_page.store(nullptr, std::memory_order_relaxed); + } + } + my_queue.head_counter.store(my_ticket_type, std::memory_order_release); + if ( is_valid_page(p) ) { + allocator_traits_type::destroy(allocator, static_cast(p)); + allocator_traits_type::deallocate(allocator, static_cast(p), 1); + } + } +private: + ticket_type my_ticket_type; + Container& my_queue; + padded_page* my_page; + Allocator& allocator; +}; // class micro_queue_pop_finalizer + +#if _MSC_VER && !defined(__INTEL_COMPILER) +// structure was padded due to alignment specifier +#pragma warning( push ) +#pragma warning( disable: 4324 ) +#endif + +template +struct concurrent_queue_rep { + using self_type = concurrent_queue_rep; + using size_type = std::size_t; + using micro_queue_type = micro_queue; + using allocator_type = Allocator; + using allocator_traits_type = tbb::detail::allocator_traits; + using padded_page = typename micro_queue_type::padded_page; + using page_allocator_type = typename micro_queue_type::page_allocator_type; + using item_constructor_type = typename micro_queue_type::item_constructor_type; +private: + using page_allocator_traits = tbb::detail::allocator_traits; + using queue_allocator_type = typename allocator_traits_type::template rebind_alloc; + +public: + // must be power of 2 + static constexpr size_type n_queue = 8; + // Approximately n_queue/golden ratio + static constexpr size_type phi = 3; + static constexpr size_type item_size = micro_queue_type::item_size; + static constexpr size_type items_per_page = micro_queue_type::items_per_page; + + concurrent_queue_rep() {} + + concurrent_queue_rep( const concurrent_queue_rep& ) = delete; + concurrent_queue_rep& operator=( const concurrent_queue_rep& ) = delete; + + void clear( queue_allocator_type& alloc ) { + for (size_type index = 0; index < n_queue; ++index) { + array[index].clear(alloc); + } + head_counter.store(0, std::memory_order_relaxed); + tail_counter.store(0, std::memory_order_relaxed); + n_invalid_entries.store(0, std::memory_order_relaxed); + } + + void assign( const concurrent_queue_rep& src, queue_allocator_type& alloc, item_constructor_type construct_item ) { + head_counter.store(src.head_counter.load(std::memory_order_relaxed), std::memory_order_relaxed); + tail_counter.store(src.tail_counter.load(std::memory_order_relaxed), std::memory_order_relaxed); + n_invalid_entries.store(src.n_invalid_entries.load(std::memory_order_relaxed), std::memory_order_relaxed); + + // copy or move micro_queues + size_type queue_idx = 0; + try_call( [&] { + for (; queue_idx < n_queue; ++queue_idx) { + array[queue_idx].assign(src.array[queue_idx], alloc, construct_item); + } + }).on_exception( [&] { + for (size_type i = 0; i < queue_idx + 1; ++i) { + array[i].clear_and_invalidate(alloc); + } + head_counter.store(0, std::memory_order_relaxed); + tail_counter.store(0, std::memory_order_relaxed); + n_invalid_entries.store(0, std::memory_order_relaxed); + }); + + __TBB_ASSERT(head_counter.load(std::memory_order_relaxed) == src.head_counter.load(std::memory_order_relaxed) && + tail_counter.load(std::memory_order_relaxed) == 
src.tail_counter.load(std::memory_order_relaxed), + "the source concurrent queue should not be concurrently modified." ); + } + + bool empty() const { + ticket_type tc = tail_counter.load(std::memory_order_acquire); + ticket_type hc = head_counter.load(std::memory_order_relaxed); + // if tc!=r.tail_counter, the queue was not empty at some point between the two reads. + return tc == tail_counter.load(std::memory_order_relaxed) && + std::ptrdiff_t(tc - hc - n_invalid_entries.load(std::memory_order_relaxed)) <= 0; + } + + std::ptrdiff_t size() const { + __TBB_ASSERT(sizeof(std::ptrdiff_t) <= sizeof(size_type), nullptr); + std::ptrdiff_t hc = head_counter.load(std::memory_order_acquire); + std::ptrdiff_t tc = tail_counter.load(std::memory_order_relaxed); + std::ptrdiff_t nie = n_invalid_entries.load(std::memory_order_relaxed); + + return tc - hc - nie; + } + + friend class micro_queue; + + // Map ticket_type to an array index + static size_type index( ticket_type k ) { + return k * phi % n_queue; + } + + micro_queue_type& choose( ticket_type k ) { + // The formula here approximates LRU in a cache-oblivious way. + return array[index(k)]; + } + + alignas(max_nfs_size) micro_queue_type array[n_queue]; + + alignas(max_nfs_size) std::atomic head_counter{}; + alignas(max_nfs_size) std::atomic tail_counter{}; + alignas(max_nfs_size) std::atomic n_invalid_entries{}; +}; // class concurrent_queue_rep + +#if _MSC_VER && !defined(__INTEL_COMPILER) +#pragma warning( pop ) +#endif + +template +class concurrent_queue_iterator_base { + using queue_rep_type = concurrent_queue_rep; + using padded_page = typename queue_rep_type::padded_page; +protected: + concurrent_queue_iterator_base() = default; + + concurrent_queue_iterator_base( const concurrent_queue_iterator_base& other ) { + assign(other); + } + + concurrent_queue_iterator_base( queue_rep_type* queue_rep ) + : my_queue_rep(queue_rep), + my_head_counter(my_queue_rep->head_counter.load(std::memory_order_relaxed)) + { + for (std::size_t i = 0; i < queue_rep_type::n_queue; ++i) { + my_array[i] = my_queue_rep->array[i].get_head_page(); + } + + if (!get_item(my_item, my_head_counter)) advance(); + } + + void assign( const concurrent_queue_iterator_base& other ) { + my_item = other.my_item; + my_queue_rep = other.my_queue_rep; + + if (my_queue_rep != nullptr) { + my_head_counter = other.my_head_counter; + + for (std::size_t i = 0; i < queue_rep_type::n_queue; ++i) { + my_array[i] = other.my_array[i]; + } + } + } + + void advance() { + __TBB_ASSERT(my_item, "Attempt to increment iterator past end of the queue"); + std::size_t k = my_head_counter; +#if TBB_USE_ASSERT + Value* tmp; + get_item(tmp, k); + __TBB_ASSERT(my_item == tmp, nullptr); +#endif + std::size_t i = modulo_power_of_two(k / queue_rep_type::n_queue, my_queue_rep->items_per_page); + if (i == my_queue_rep->items_per_page - 1) { + padded_page*& root = my_array[queue_rep_type::index(k)]; + root = root->next; + } + // Advance k + my_head_counter = ++k; + if (!get_item(my_item, k)) advance(); + } + + concurrent_queue_iterator_base& operator=( const concurrent_queue_iterator_base& other ) { + this->assign(other); + return *this; + } + + bool get_item( Value*& item, std::size_t k ) { + if (k == my_queue_rep->tail_counter.load(std::memory_order_relaxed)) { + item = nullptr; + return true; + } else { + padded_page* p = my_array[queue_rep_type::index(k)]; + __TBB_ASSERT(p, nullptr); + std::size_t i = modulo_power_of_two(k / queue_rep_type::n_queue, my_queue_rep->items_per_page); + item = &(*p)[i]; + return 
(p->mask & uintptr_t(1) << i) != 0; + } + } + + Value* my_item{ nullptr }; + queue_rep_type* my_queue_rep{ nullptr }; + ticket_type my_head_counter{}; + padded_page* my_array[queue_rep_type::n_queue]{}; +}; // class concurrent_queue_iterator_base + +struct concurrent_queue_iterator_provider { + template + static Iterator get( const Container& container ) { + return Iterator(container); + } +}; // struct concurrent_queue_iterator_provider + +template +class concurrent_queue_iterator : public concurrent_queue_iterator_base::type, Allocator> { + using base_type = concurrent_queue_iterator_base::type, Allocator>; +public: + using value_type = Value; + using pointer = value_type*; + using reference = value_type&; + using difference_type = std::ptrdiff_t; + using iterator_category = std::forward_iterator_tag; + + concurrent_queue_iterator() = default; + + /** If Value==Container::value_type, then this routine is the copy constructor. + If Value==const Container::value_type, then this routine is a conversion constructor. */ + concurrent_queue_iterator( const concurrent_queue_iterator& other ) + : base_type(other) {} + +private: + concurrent_queue_iterator( const Container& container ) + : base_type(container.my_queue_representation) {} +public: + concurrent_queue_iterator& operator=( const concurrent_queue_iterator& other ) { + this->assign(other); + return *this; + } + + reference operator*() const { + return *static_cast(this->my_item); + } + + pointer operator->() const { return &operator*(); } + + concurrent_queue_iterator& operator++() { + this->advance(); + return *this; + } + + concurrent_queue_iterator operator++(int) { + concurrent_queue_iterator tmp = *this; + ++*this; + return tmp; + } + + friend bool operator==( const concurrent_queue_iterator& lhs, const concurrent_queue_iterator& rhs ) { + return lhs.my_item == rhs.my_item; + } + + friend bool operator!=( const concurrent_queue_iterator& lhs, const concurrent_queue_iterator& rhs ) { + return lhs.my_item != rhs.my_item; + } +private: + friend struct concurrent_queue_iterator_provider; +}; // class concurrent_queue_iterator + +} // namespace d2 +} // namespace detail +} // tbb + +#endif // __TBB_detail__concurrent_queue_base_H diff --git a/third_party/tbb/detail/_concurrent_skip_list.h b/third_party/tbb/detail/_concurrent_skip_list.h new file mode 100644 index 000000000..df1f80f07 --- /dev/null +++ b/third_party/tbb/detail/_concurrent_skip_list.h @@ -0,0 +1,1291 @@ +// clang-format off +/* + Copyright (c) 2019-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__concurrent_skip_list_H +#define __TBB_detail__concurrent_skip_list_H + +#if !defined(__TBB_concurrent_map_H) && !defined(__TBB_concurrent_set_H) +#error Do not #include this internal file directly; use public TBB headers instead. 
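As the guard above states, this header is reached only through the public containers; concurrent_map and concurrent_set are thin traits wrappers around the skip list defined below. A hedged usage sketch of that public surface, assuming the header paths as vendored in this patch and the usual oneTBB semantics:

#include <cstdio>
#include "third_party/tbb/concurrent_set.h"
#include "third_party/tbb/parallel_for.h"

int main() {
    tbb::concurrent_set<int> s;  // ordered container backed by concurrent_skip_list

    // Concurrent insertion is thread-safe; duplicates are rejected because the
    // set is a unique container (allow_multimapping == false).
    tbb::parallel_for(0, 1000, [&](int i) { s.insert(i % 100); });

    std::printf("size = %zu, contains(42) = %d\n", s.size(), int(s.contains(42)));
    return 0;
}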
+#endif + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_range_common.h" +#include "third_party/tbb/detail/_allocator_traits.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_node_handle.h" +#include "third_party/tbb/detail/_containers_helpers.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/enumerable_thread_specific.h" +#include "third_party/libcxx/utility" +#include "third_party/libcxx/initializer_list" +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/array" +#include "third_party/libcxx/type_traits" +#include "third_party/libcxx/random" // Need std::geometric_distribution +#include "third_party/libcxx/algorithm" // Need std::equal and std::lexicographical_compare +#include "third_party/libcxx/cstdint" +#if __TBB_CPP20_COMPARISONS_PRESENT +#include "third_party/libcxx/compare" +#endif + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma warning(push) +#pragma warning(disable: 4127) // warning C4127: conditional expression is constant +#endif + +namespace tbb { +namespace detail { +namespace d2 { + +template +class skip_list_node { + using node_ptr = skip_list_node*; +public: + using value_type = Value; + using atomic_node_ptr = std::atomic; + using size_type = std::size_t; + using container_allocator_type = Allocator; + + using reference = value_type&; + using const_reference = const value_type&; +private: + using allocator_traits = tbb::detail::allocator_traits; + + // Allocator is the same as the container allocator=> allocates unitptr_t + // It is required to rebind it to value_type to get the correct pointer and const_pointer + using value_allocator_traits = typename allocator_traits::template rebind_traits; +public: + using pointer = typename value_allocator_traits::pointer; + using const_pointer = typename value_allocator_traits::const_pointer; + + //In perfect world these constructor and destructor would have been private, + //however this seems technically impractical due to use of allocator_traits. 
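The create()/destroy() pair that follows implements a manual flexible-array layout: a single allocation holds the node header immediately followed by height atomic next pointers, and get_atomic_next() reaches them by treating this + 1 as the start of that array. A minimal sketch of the same layout trick using plain operator new (illustrative only; the real code goes through allocator_traits and a byte allocator):

#include <atomic>
#include <cstddef>
#include <memory>
#include <new>

struct node {
    std::size_t height;
    // The per-level pointers live directly behind the node object.
    std::atomic<node*>* levels() { return reinterpret_cast<std::atomic<node*>*>(this + 1); }
};

static node* make_node(std::size_t height) {
    static_assert(alignof(node) >= alignof(std::atomic<node*>), "layout assumption");
    void* raw = ::operator new(sizeof(node) + height * sizeof(std::atomic<node*>));
    node* n = new (raw) node{height};
    for (std::size_t l = 0; l < height; ++l)
        new (&n->levels()[l]) std::atomic<node*>(nullptr);  // construct each level pointer
    return n;
}

static void free_node(node* n) {
    for (std::size_t l = 0; l < n->height; ++l)
        std::destroy_at(&n->levels()[l]);
    n->~node();
    ::operator delete(n);
}

int main() {
    node* n = make_node(4);
    n->levels()[0].store(nullptr, std::memory_order_relaxed);
    free_node(n);
    return 0;
}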
+ + //Should not be called directly, instead use create method + skip_list_node( size_type levels ) + : my_height(levels), my_index_number(0) + {} + + //Should not be called directly, instead use destroy method + ~skip_list_node() {} + + skip_list_node( const skip_list_node& ) = delete; + skip_list_node( skip_list_node&& ) = delete; + skip_list_node& operator=( const skip_list_node& ) = delete; + skip_list_node& operator=( skip_list_node&& ) = delete; + + static skip_list_node* create( container_allocator_type& alloc, size_type height ) { + size_type sz = calc_node_size(height); + static_assert(std::is_same::value, "skip_list_node assumes that passed in allocator operates on bytes"); + auto* node = reinterpret_cast(allocator_traits::allocate(alloc, sz)); + + //Construct the node itself + allocator_traits::construct(alloc, node, height); + + //Construct the level pointers + for (size_type l = 0; l < height; ++l) { + allocator_traits::construct(alloc, &node->get_atomic_next(l), nullptr); + } + + return node; + } + + static void destroy( container_allocator_type& alloc, skip_list_node* node ) { + //Destroy the level pointers + for (size_type l = 0; l < node->height(); ++l) { + allocator_traits::destroy(alloc, &node->atomic_next(l)); + } + size_type sz = calc_node_size(node->height()); + // Destroy the node itself + allocator_traits::destroy(alloc, node); + + // Deallocate the node + allocator_traits::deallocate(alloc, reinterpret_cast(node), sz); + } + + + pointer storage() { + return &my_value; + } + + reference value() { + return *storage(); + } + + node_ptr next( size_type level ) const { + node_ptr res = get_atomic_next(level).load(std::memory_order_acquire); + __TBB_ASSERT(res == nullptr || res->height() > level, "Broken internal structure"); + return res; + } + + atomic_node_ptr& atomic_next( size_type level ) { + atomic_node_ptr& res = get_atomic_next(level); +#if TBB_USE_DEBUG + node_ptr node = res.load(std::memory_order_acquire); + __TBB_ASSERT(node == nullptr || node->height() > level, "Broken internal structure"); +#endif + return res; + } + + void set_next( size_type level, node_ptr n ) { + __TBB_ASSERT(n == nullptr || n->height() > level, "Broken internal structure"); + get_atomic_next(level).store(n, std::memory_order_relaxed); + } + + size_type height() const { + return my_height; + } + + void set_index_number( size_type index_num ) { + my_index_number = index_num; + } + + size_type index_number() const { + return my_index_number; + } + +private: + static size_type calc_node_size( size_type height ) { + static_assert(alignof(skip_list_node) >= alignof(atomic_node_ptr), "Incorrect alignment"); + return sizeof(skip_list_node) + height * sizeof(atomic_node_ptr); + } + + atomic_node_ptr& get_atomic_next( size_type level ) { + atomic_node_ptr* arr = reinterpret_cast(this + 1); + return arr[level]; + } + + const atomic_node_ptr& get_atomic_next( size_type level ) const { + const atomic_node_ptr* arr = reinterpret_cast(this + 1); + return arr[level]; + } + + union { + value_type my_value; + }; + size_type my_height; + size_type my_index_number; +}; // class skip_list_node + +template +class skip_list_iterator { + using node_type = NodeType; + using node_ptr = node_type*; +public: + using iterator_category = std::forward_iterator_tag; + using value_type = ValueType; + + using difference_type = std::ptrdiff_t; + using pointer = value_type*; + using reference = value_type&; + + skip_list_iterator() : skip_list_iterator(nullptr) {} + + skip_list_iterator( const skip_list_iterator& other ) 
+ : my_node_ptr(other.my_node_ptr) {} + + skip_list_iterator& operator=( const skip_list_iterator& other ) { + my_node_ptr = other.my_node_ptr; + return *this; + } + + reference operator*() const { return my_node_ptr->value(); } + pointer operator->() const { return my_node_ptr->storage(); } + + skip_list_iterator& operator++() { + __TBB_ASSERT(my_node_ptr != nullptr, nullptr); + my_node_ptr = my_node_ptr->next(0); + return *this; + } + + skip_list_iterator operator++(int) { + skip_list_iterator tmp = *this; + ++*this; + return tmp; + } + +private: + skip_list_iterator(node_type* n) : my_node_ptr(n) {} + + node_ptr my_node_ptr; + + template + friend class concurrent_skip_list; + + template + friend class skip_list_iterator; + + friend class const_range; + friend class range; + + friend bool operator==( const skip_list_iterator& lhs, const skip_list_iterator& rhs ) { + return lhs.my_node_ptr == rhs.my_node_ptr; + } + + friend bool operator!=( const skip_list_iterator& lhs, const skip_list_iterator& rhs ) { + return lhs.my_node_ptr != rhs.my_node_ptr; + } +}; // class skip_list_iterator + +template +class concurrent_skip_list { +protected: + using container_traits = Traits; + using self_type = concurrent_skip_list; + using allocator_type = typename container_traits::allocator_type; + using allocator_traits_type = tbb::detail::allocator_traits; + using key_compare = typename container_traits::compare_type; + using value_compare = typename container_traits::value_compare; + using key_type = typename container_traits::key_type; + using value_type = typename container_traits::value_type; + static_assert(std::is_same::value, + "value_type of the container should be the same as its allocator"); + + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + + static constexpr size_type max_level = container_traits::max_level; + + using node_allocator_type = typename allocator_traits_type::template rebind_alloc; + using node_allocator_traits = tbb::detail::allocator_traits; + + using list_node_type = skip_list_node; + using node_type = d1::node_handle; + + using iterator = skip_list_iterator; + using const_iterator = skip_list_iterator; + + using reference = value_type&; + using const_reference = const value_type&; + using pointer = typename allocator_traits_type::pointer; + using const_pointer = typename allocator_traits_type::const_pointer; + + using random_level_generator_type = typename container_traits::random_level_generator_type; + + using node_ptr = list_node_type*; + + using array_type = std::array; +private: + template + using is_transparent = dependent_bool, T>; +public: + static constexpr bool allow_multimapping = container_traits::allow_multimapping; + + concurrent_skip_list() : my_head_ptr(nullptr), my_size(0), my_max_height(0) {} + + explicit concurrent_skip_list( const key_compare& comp, const allocator_type& alloc = allocator_type() ) + : my_node_allocator(alloc), my_compare(comp), my_head_ptr(nullptr), my_size(0), my_max_height(0) {} + + explicit concurrent_skip_list( const allocator_type& alloc ) + : concurrent_skip_list(key_compare(), alloc) {} + + template + concurrent_skip_list( InputIterator first, InputIterator last, const key_compare& comp = key_compare(), + const allocator_type& alloc = allocator_type() ) + : concurrent_skip_list(comp, alloc) + { + internal_copy(first, last); + } + + template + concurrent_skip_list( InputIterator first, InputIterator last, const allocator_type& alloc ) + : concurrent_skip_list(first, last, key_compare(), alloc) {} + + 
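The constructor family above mirrors std::set: comparator, allocator, iterator-range, and initializer_list forms. A short hedged example of those forms through the public concurrent_set wrapper (assuming, as elsewhere in this patch, that the wrapper simply forwards to these constructors):

#include "third_party/tbb/concurrent_set.h"
#include <cstdio>
#include <functional>
#include <vector>

int main() {
    // initializer_list plus a custom comparator, as declared above
    tbb::concurrent_set<int, std::greater<int>> desc({3, 1, 4, 1, 5}, std::greater<int>());

    // iterator-range constructor
    std::vector<int> v{10, 20, 30};
    tbb::concurrent_set<int> from_range(v.begin(), v.end());

    std::printf("largest = %d, copied = %zu\n", *desc.begin(), from_range.size());
    return 0;
}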
concurrent_skip_list( std::initializer_list init, const key_compare& comp = key_compare(), + const allocator_type& alloc = allocator_type() ) + : concurrent_skip_list(init.begin(), init.end(), comp, alloc) {} + + concurrent_skip_list( std::initializer_list init, const allocator_type& alloc ) + : concurrent_skip_list(init, key_compare(), alloc) {} + + concurrent_skip_list( const concurrent_skip_list& other ) + : my_node_allocator(node_allocator_traits::select_on_container_copy_construction(other.get_allocator())), + my_compare(other.my_compare), my_rng(other.my_rng), my_head_ptr(nullptr), + my_size(0), my_max_height(0) + { + internal_copy(other); + __TBB_ASSERT(my_size == other.my_size, "Wrong size of copy-constructed container"); + } + + concurrent_skip_list( const concurrent_skip_list& other, const allocator_type& alloc ) + : my_node_allocator(alloc), my_compare(other.my_compare), my_rng(other.my_rng), my_head_ptr(nullptr), + my_size(0), my_max_height(0) + { + internal_copy(other); + __TBB_ASSERT(my_size == other.my_size, "Wrong size of copy-constructed container"); + } + + concurrent_skip_list( concurrent_skip_list&& other ) + : my_node_allocator(std::move(other.my_node_allocator)), my_compare(other.my_compare), + my_rng(std::move(other.my_rng)), my_head_ptr(nullptr) // my_head_ptr would be stored in internal_move + { + internal_move(std::move(other)); + } + + concurrent_skip_list( concurrent_skip_list&& other, const allocator_type& alloc ) + : my_node_allocator(alloc), my_compare(other.my_compare), + my_rng(std::move(other.my_rng)), my_head_ptr(nullptr) + { + using is_always_equal = typename allocator_traits_type::is_always_equal; + internal_move_construct_with_allocator(std::move(other), is_always_equal()); + } + + ~concurrent_skip_list() { + clear(); + delete_head(); + } + + concurrent_skip_list& operator=( const concurrent_skip_list& other ) { + if (this != &other) { + clear(); + copy_assign_allocators(my_node_allocator, other.my_node_allocator); + my_compare = other.my_compare; + my_rng = other.my_rng; + internal_copy(other); + } + return *this; + } + + concurrent_skip_list& operator=( concurrent_skip_list&& other ) { + if (this != &other) { + clear(); + delete_head(); + + my_compare = std::move(other.my_compare); + my_rng = std::move(other.my_rng); + + move_assign_allocators(my_node_allocator, other.my_node_allocator); + using pocma_type = typename node_allocator_traits::propagate_on_container_move_assignment; + using is_always_equal = typename node_allocator_traits::is_always_equal; + internal_move_assign(std::move(other), tbb::detail::disjunction()); + } + return *this; + } + + concurrent_skip_list& operator=( std::initializer_list il ) + { + clear(); + insert(il.begin(),il.end()); + return *this; + } + + std::pair insert( const value_type& value ) { + return internal_insert(value); + } + + std::pair insert( value_type&& value ) { + return internal_insert(std::move(value)); + } + + iterator insert( const_iterator, const_reference value ) { + // Ignore hint + return insert(value).first; + } + + iterator insert( const_iterator, value_type&& value ) { + // Ignore hint + return insert(std::move(value)).first; + } + + template + void insert( InputIterator first, InputIterator last ) { + while (first != last) { + insert(*first); + ++first; + } + } + + void insert( std::initializer_list init ) { + insert(init.begin(), init.end()); + } + + std::pair insert( node_type&& nh ) { + if (!nh.empty()) { + auto insert_node = d1::node_handle_accessor::get_node_ptr(nh); + std::pair insert_result = 
internal_insert_node(insert_node); + if (insert_result.second) { + d1::node_handle_accessor::deactivate(nh); + } + return insert_result; + } + return std::pair(end(), false); + } + + iterator insert( const_iterator, node_type&& nh ) { + // Ignore hint + return insert(std::move(nh)).first; + } + + template + std::pair emplace( Args&&... args ) { + return internal_insert(std::forward(args)...); + } + + template + iterator emplace_hint( const_iterator, Args&&... args ) { + // Ignore hint + return emplace(std::forward(args)...).first; + } + + iterator unsafe_erase( iterator pos ) { + std::pair extract_result = internal_extract(pos); + if (extract_result.first) { // node was extracted + delete_value_node(extract_result.first); + return extract_result.second; + } + return end(); + } + + iterator unsafe_erase( const_iterator pos ) { + return unsafe_erase(get_iterator(pos)); + } + + iterator unsafe_erase( const_iterator first, const_iterator last ) { + while (first != last) { + // Unsafe erase returns the iterator which follows the erased one + first = unsafe_erase(first); + } + return get_iterator(first); + } + + size_type unsafe_erase( const key_type& key ) { + return internal_erase(key); + } + + template + typename std::enable_if::value + && !std::is_convertible::value + && !std::is_convertible::value, + size_type>::type unsafe_erase( const K& key ) + { + return internal_erase(key); + } + + node_type unsafe_extract( const_iterator pos ) { + std::pair extract_result = internal_extract(pos); + return extract_result.first ? d1::node_handle_accessor::construct(extract_result.first) : node_type(); + } + + node_type unsafe_extract( iterator pos ) { + return unsafe_extract(const_iterator(pos)); + } + + node_type unsafe_extract( const key_type& key ) { + return unsafe_extract(find(key)); + } + + template + typename std::enable_if::value + && !std::is_convertible::value + && !std::is_convertible::value, + node_type>::type unsafe_extract( const K& key ) + { + return unsafe_extract(find(key)); + } + + iterator lower_bound( const key_type& key ) { + return iterator(internal_get_bound(key, my_compare)); + } + + const_iterator lower_bound( const key_type& key ) const { + return const_iterator(internal_get_bound(key, my_compare)); + } + + template + typename std::enable_if::value, iterator>::type lower_bound( const K& key ) { + return iterator(internal_get_bound(key, my_compare)); + } + + template + typename std::enable_if::value, const_iterator>::type lower_bound( const K& key ) const { + return const_iterator(internal_get_bound(key, my_compare)); + } + + iterator upper_bound( const key_type& key ) { + return iterator(internal_get_bound(key, not_greater_compare(my_compare))); + } + + const_iterator upper_bound( const key_type& key ) const { + return const_iterator(internal_get_bound(key, not_greater_compare(my_compare))); + } + + template + typename std::enable_if::value, iterator>::type upper_bound( const K& key ) { + return iterator(internal_get_bound(key, not_greater_compare(my_compare))); + } + + template + typename std::enable_if::value, const_iterator>::type upper_bound( const K& key ) const { + return const_iterator(internal_get_bound(key, not_greater_compare(my_compare))); + } + + iterator find( const key_type& key ) { + return iterator(internal_find(key)); + } + + const_iterator find( const key_type& key ) const { + return const_iterator(internal_find(key)); + } + + template + typename std::enable_if::value, iterator>::type find( const K& key ) { + return iterator(internal_find(key)); + } + + 
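The template overloads above are enabled only for transparent comparators (is_transparent), which allows lookups with a key-like type that is never converted to key_type. A hedged sketch through the public concurrent_set wrapper, assuming std::less<> as the transparent comparator:

#include "third_party/tbb/concurrent_set.h"
#include <cstdio>
#include <string>

int main() {
    // std::less<> defines is_transparent, so the heterogeneous find/count/contains
    // overloads participate and the const char* probe is compared directly.
    tbb::concurrent_set<std::string, std::less<>> names;
    names.insert("alice");
    names.insert("bob");

    const char* probe = "alice";
    std::printf("contains = %d, count = %zu\n", int(names.contains(probe)), names.count("bob"));
    return 0;
}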
template + typename std::enable_if::value, const_iterator>::type find( const K& key ) const { + return const_iterator(internal_find(key)); + } + + size_type count( const key_type& key ) const { + return internal_count(key); + } + + template + typename std::enable_if::value, size_type>::type count( const K& key ) const { + return internal_count(key); + } + + bool contains( const key_type& key ) const { + return find(key) != end(); + } + + template + typename std::enable_if::value, bool>::type contains( const K& key ) const { + return find(key) != end(); + } + + void clear() noexcept { + // clear is not thread safe - load can be relaxed + node_ptr head = my_head_ptr.load(std::memory_order_relaxed); + + if (head == nullptr) return; // Head is not allocated => container is empty + + node_ptr current = head->next(0); + + // Delete all value nodes in the container + while (current) { + node_ptr next = current->next(0); + delete_value_node(current); + current = next; + } + + for (size_type level = 0; level < head->height(); ++level) { + head->set_next(level, nullptr); + } + + my_size.store(0, std::memory_order_relaxed); + my_max_height.store(0, std::memory_order_relaxed); + } + + iterator begin() { + return iterator(internal_begin()); + } + + const_iterator begin() const { + return const_iterator(internal_begin()); + } + + const_iterator cbegin() const { + return const_iterator(internal_begin()); + } + + iterator end() { + return iterator(nullptr); + } + + const_iterator end() const { + return const_iterator(nullptr); + } + + const_iterator cend() const { + return const_iterator(nullptr); + } + + size_type size() const { + return my_size.load(std::memory_order_relaxed); + } + + size_type max_size() const { + return node_allocator_traits::max_size(my_node_allocator); + } + + __TBB_nodiscard bool empty() const { + return 0 == size(); + } + + allocator_type get_allocator() const { + return my_node_allocator; + } + + void swap(concurrent_skip_list& other) { + if (this != &other) { + using pocs_type = typename node_allocator_traits::propagate_on_container_swap; + using is_always_equal = typename node_allocator_traits::is_always_equal; + internal_swap(other, tbb::detail::disjunction()); + } + } + + std::pair equal_range(const key_type& key) { + return internal_equal_range(key); + } + + std::pair equal_range(const key_type& key) const { + return internal_equal_range(key); + } + + template + typename std::enable_if::value, std::pair>::type equal_range( const K& key ) { + return internal_equal_range(key); + } + + template + typename std::enable_if::value, std::pair>::type equal_range( const K& key ) const { + return internal_equal_range(key); + } + + key_compare key_comp() const { return my_compare; } + + value_compare value_comp() const { return container_traits::value_comp(my_compare); } + + class const_range_type { + public: + using size_type = typename concurrent_skip_list::size_type; + using difference_type = typename concurrent_skip_list::difference_type; + using iterator = typename concurrent_skip_list::const_iterator; + using value_type = typename iterator::value_type; + using reference = typename iterator::reference; + + bool empty() const { + return my_begin.my_node_ptr ? (my_begin.my_node_ptr->next(0) == my_end.my_node_ptr) + : true; + } + + bool is_divisible() const { + return my_begin.my_node_ptr && my_level != 0 + ? 
my_begin.my_node_ptr->next(my_level - 1) != my_end.my_node_ptr + : false; + } + + size_type size() const { return std::distance(my_begin, my_end); } + + const_range_type( const_range_type& r, split) + : my_end(r.my_end) { + if (r.empty()) { + __TBB_ASSERT(my_end.my_node_ptr == nullptr, nullptr); + my_begin = my_end; + my_level = 0; + } else { + my_begin = iterator(r.my_begin.my_node_ptr->next(r.my_level - 1)); + my_level = my_begin.my_node_ptr->height(); + } + r.my_end = my_begin; + } + + const_range_type( const concurrent_skip_list& l) + : my_end(l.end()), my_begin(l.begin()), + my_level(my_begin.my_node_ptr ? my_begin.my_node_ptr->height() : 0) {} + + iterator begin() const { return my_begin; } + iterator end() const { return my_end; } + size_type grainsize() const { return 1; } + + private: + const_iterator my_end; + const_iterator my_begin; + size_type my_level; + }; // class const_range_type + + class range_type : public const_range_type { + public: + using iterator = typename concurrent_skip_list::iterator; + using value_type = typename iterator::value_type; + using reference = typename iterator::reference; + + range_type(range_type& r, split) : const_range_type(r, split()) {} + range_type(const concurrent_skip_list& l) : const_range_type(l) {} + + iterator begin() const { + node_ptr node = const_range_type::begin().my_node_ptr; + return iterator(node); + } + + iterator end() const { + node_ptr node = const_range_type::end().my_node_ptr; + return iterator(node); + } + }; // class range_type + + range_type range() { return range_type(*this); } + const_range_type range() const { return const_range_type(*this); } + +private: + node_ptr internal_begin() const { + node_ptr head = get_head(); + return head == nullptr ? head : head->next(0); + } + + void internal_move(concurrent_skip_list&& other) { + my_head_ptr.store(other.my_head_ptr.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.my_head_ptr.store(nullptr, std::memory_order_relaxed); + + my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.my_size.store(0, std::memory_order_relaxed); + + my_max_height.store(other.my_max_height.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.my_max_height.store(0, std::memory_order_relaxed); + } + + void internal_move_construct_with_allocator(concurrent_skip_list&& other, + /*is_always_equal = */std::true_type) { + internal_move(std::move(other)); + } + + void internal_move_construct_with_allocator(concurrent_skip_list&& other, + /*is_always_equal = */std::false_type) { + if (my_node_allocator == other.get_allocator()) { + internal_move(std::move(other)); + } else { + my_size.store(0, std::memory_order_relaxed); + my_max_height.store(other.my_max_height.load(std::memory_order_relaxed), std::memory_order_relaxed); + internal_copy(std::make_move_iterator(other.begin()), std::make_move_iterator(other.end())); + } + } + + static const key_type& get_key( node_ptr n ) { + __TBB_ASSERT(n, nullptr); + return container_traits::get_key(static_cast(n)->value()); + } + + template + bool found( node_ptr node, const K& key ) const { + return node != nullptr && !my_compare(key, get_key(node)); + } + + template + node_ptr internal_find(const K& key) const { + return allow_multimapping ? 
internal_find_multi(key) : internal_find_unique(key); + } + + template + node_ptr internal_find_multi( const K& key ) const { + node_ptr prev = get_head(); + if (prev == nullptr) return nullptr; // If the head node is not allocated - exit + + node_ptr curr = nullptr; + node_ptr old_curr = curr; + + for (size_type h = my_max_height.load(std::memory_order_acquire); h > 0; --h) { + curr = internal_find_position(h - 1, prev, key, my_compare); + + if (curr != old_curr && found(curr, key)) { + return curr; + } + old_curr = curr; + } + return nullptr; + } + + template + node_ptr internal_find_unique( const K& key ) const { + const_iterator it = lower_bound(key); + return (it == end() || my_compare(key, container_traits::get_key(*it))) ? nullptr : it.my_node_ptr; + } + + template + size_type internal_count( const K& key ) const { + if (allow_multimapping) { + // TODO: reimplement without double traversal + std::pair r = equal_range(key); + return std::distance(r.first, r.second); + } + return size_type(contains(key) ? 1 : 0); + } + + template + std::pair internal_equal_range(const K& key) const { + iterator lb = get_iterator(lower_bound(key)); + auto result = std::make_pair(lb, lb); + + // If the lower bound points to the node with the requested key + if (found(lb.my_node_ptr, key)) { + + if (!allow_multimapping) { + // For unique containers - move the second iterator forward and exit + ++result.second; + } else { + // For multi containers - find the upper bound starting from the lower bound + node_ptr prev = lb.my_node_ptr; + node_ptr curr = nullptr; + not_greater_compare cmp(my_compare); + + // Start from the lower bound of the range + for (size_type h = prev->height(); h > 0; --h) { + curr = prev->next(h - 1); + while (curr && cmp(get_key(curr), key)) { + prev = curr; + // If the height of the next node is greater than the current one - jump to its height + if (h < curr->height()) { + h = curr->height(); + } + curr = prev->next(h - 1); + } + } + result.second = iterator(curr); + } + } + + return result; + } + + // Finds position on the level using comparator cmp starting from the node prev + template + node_ptr internal_find_position( size_type level, node_ptr& prev, const K& key, + const Comparator& cmp ) const { + __TBB_ASSERT(level < prev->height(), "Wrong level to find position"); + node_ptr curr = prev->next(level); + + while (curr && cmp(get_key(curr), key)) { + prev = curr; + __TBB_ASSERT(level < prev->height(), nullptr); + curr = prev->next(level); + } + + return curr; + } + + // The same as previous overload, but allows index_number comparison + template + node_ptr internal_find_position( size_type level, node_ptr& prev, node_ptr node, + const Comparator& cmp ) const { + __TBB_ASSERT(level < prev->height(), "Wrong level to find position"); + node_ptr curr = prev->next(level); + + while (curr && cmp(get_key(curr), get_key(node))) { + if (allow_multimapping && cmp(get_key(node), get_key(curr)) && curr->index_number() > node->index_number()) { + break; + } + + prev = curr; + __TBB_ASSERT(level < prev->height(), nullptr); + curr = prev->next(level); + } + return curr; + } + + template + void fill_prev_curr_arrays(array_type& prev_nodes, array_type& curr_nodes, node_ptr node, const key_type& key, + const Comparator& cmp, node_ptr head ) { + + size_type curr_max_height = my_max_height.load(std::memory_order_acquire); + size_type node_height = node->height(); + if (curr_max_height < node_height) { + std::fill(prev_nodes.begin() + curr_max_height, prev_nodes.begin() + node_height, head); + 
std::fill(curr_nodes.begin() + curr_max_height, curr_nodes.begin() + node_height, nullptr); + } + + node_ptr prev = head; + for (size_type level = curr_max_height; level > 0; --level) { + node_ptr curr = internal_find_position(level - 1, prev, key, cmp); + prev_nodes[level - 1] = prev; + curr_nodes[level - 1] = curr; + } + } + + void fill_prev_array_for_existing_node( array_type& prev_nodes, node_ptr node ) { + node_ptr head = create_head_if_necessary(); + prev_nodes.fill(head); + + node_ptr prev = head; + for (size_type level = node->height(); level > 0; --level) { + while (prev->next(level - 1) != node) { + prev = prev->next(level - 1); + } + prev_nodes[level - 1] = prev; + } + } + + struct not_greater_compare { + const key_compare& my_less_compare; + + not_greater_compare( const key_compare& less_compare ) : my_less_compare(less_compare) {} + + template + bool operator()( const K1& first, const K2& second ) const { + return !my_less_compare(second, first); + } + }; + + not_greater_compare select_comparator( /*allow_multimapping = */ std::true_type ) { + return not_greater_compare(my_compare); + } + + key_compare select_comparator( /*allow_multimapping = */ std::false_type ) { + return my_compare; + } + + template + std::pair internal_insert( Args&&... args ) { + node_ptr new_node = create_value_node(std::forward(args)...); + std::pair insert_result = internal_insert_node(new_node); + if (!insert_result.second) { + delete_value_node(new_node); + } + return insert_result; + } + + std::pair internal_insert_node( node_ptr new_node ) { + array_type prev_nodes; + array_type curr_nodes; + size_type new_height = new_node->height(); + auto compare = select_comparator(std::integral_constant{}); + + node_ptr head_node = create_head_if_necessary(); + + for (;;) { + fill_prev_curr_arrays(prev_nodes, curr_nodes, new_node, get_key(new_node), compare, head_node); + + node_ptr prev = prev_nodes[0]; + node_ptr next = curr_nodes[0]; + + if (allow_multimapping) { + new_node->set_index_number(prev->index_number() + 1); + } else { + if (found(next, get_key(new_node))) { + return std::pair(iterator(next), false); + } + } + + new_node->set_next(0, next); + if (!prev->atomic_next(0).compare_exchange_strong(next, new_node)) { + continue; + } + + // If the node was successfully linked on the first level - it will be linked on other levels + // Insertion cannot fail starting from this point + + // If the height of inserted node is greater than maximum - increase maximum + size_type max_height = my_max_height.load(std::memory_order_acquire); + for (;;) { + if (new_height <= max_height || my_max_height.compare_exchange_strong(max_height, new_height)) { + // If the maximum was successfully updated by current thread + // or by an other thread for the value, greater or equal to new_height + break; + } + } + + for (std::size_t level = 1; level < new_height; ++level) { + // Link the node on upper levels + for (;;) { + prev = prev_nodes[level]; + next = static_cast(curr_nodes[level]); + + new_node->set_next(level, next); + __TBB_ASSERT(new_node->height() > level, "Internal structure break"); + if (prev->atomic_next(level).compare_exchange_strong(next, new_node)) { + break; + } + + for (size_type lev = level; lev != new_height; ++lev ) { + curr_nodes[lev] = internal_find_position(lev, prev_nodes[lev], new_node, compare); + } + } + } + ++my_size; + return std::pair(iterator(new_node), true); + } + } + + template + node_ptr internal_get_bound( const K& key, const Comparator& cmp ) const { + node_ptr prev = get_head(); + if 
(prev == nullptr) return nullptr; // If the head node is not allocated - exit + + node_ptr curr = nullptr; + + for (size_type h = my_max_height.load(std::memory_order_acquire); h > 0; --h) { + curr = internal_find_position(h - 1, prev, key, cmp); + } + + return curr; + } + + template + size_type internal_erase( const K& key ) { + auto eq = equal_range(key); + size_type old_size = size(); + unsafe_erase(eq.first, eq.second); + return old_size - size(); + } + + // Returns node_ptr to the extracted node and node_ptr to the next node after the extracted + std::pair internal_extract( const_iterator it ) { + std::pair result(nullptr, nullptr); + if ( it != end() ) { + array_type prev_nodes; + + node_ptr erase_node = it.my_node_ptr; + node_ptr next_node = erase_node->next(0); + fill_prev_array_for_existing_node(prev_nodes, erase_node); + + for (size_type level = 0; level < erase_node->height(); ++level) { + prev_nodes[level]->set_next(level, erase_node->next(level)); + erase_node->set_next(level, nullptr); + } + my_size.fetch_sub(1, std::memory_order_relaxed); + + result.first = erase_node; + result.second = next_node; + } + return result; + } + +protected: + template + void internal_merge( SourceType&& source ) { + using source_type = typename std::decay::type; + using source_iterator = typename source_type::iterator; + static_assert((std::is_same::value), "Incompatible containers cannot be merged"); + + for (source_iterator it = source.begin(); it != source.end();) { + source_iterator where = it++; + if (allow_multimapping || !contains(container_traits::get_key(*where))) { + node_type handle = source.unsafe_extract(where); + __TBB_ASSERT(!handle.empty(), "Extracted handle in merge is empty"); + + if (!insert(std::move(handle)).second) { + __TBB_ASSERT(!handle.empty(), "Handle should not be empty if insert fails"); + //If the insertion fails - return the node into source + source.insert(std::move(handle)); + } + __TBB_ASSERT(handle.empty(), "Node handle should be empty after the insertion"); + } + } + } + +private: + void internal_copy( const concurrent_skip_list& other ) { + internal_copy(other.begin(), other.end()); + } + + template + void internal_copy( Iterator first, Iterator last ) { + try_call([&] { + for (auto it = first; it != last; ++it) { + insert(*it); + } + }).on_exception([&] { + clear(); + delete_head(); + }); + } + + node_ptr create_node( size_type height ) { + return list_node_type::create(my_node_allocator, height); + } + + template + node_ptr create_value_node( Args&&... 
args ) { + node_ptr node = create_node(my_rng()); + + // try_call API is not convenient here due to broken + // variadic capture on GCC 4.8.5 + auto value_guard = make_raii_guard([&] { + delete_node(node); + }); + + // Construct the value inside the node + node_allocator_traits::construct(my_node_allocator, node->storage(), std::forward(args)...); + value_guard.dismiss(); + return node; + } + + node_ptr create_head_node() { + return create_node(max_level); + } + + void delete_head() { + node_ptr head = my_head_ptr.load(std::memory_order_relaxed); + if (head != nullptr) { + delete_node(head); + my_head_ptr.store(nullptr, std::memory_order_relaxed); + } + } + + void delete_node( node_ptr node ) { + list_node_type::destroy(my_node_allocator, node); + } + + void delete_value_node( node_ptr node ) { + // Destroy the value inside the node + node_allocator_traits::destroy(my_node_allocator, node->storage()); + delete_node(node); + } + + node_ptr get_head() const { + return my_head_ptr.load(std::memory_order_acquire); + } + + node_ptr create_head_if_necessary() { + node_ptr current_head = get_head(); + if (current_head == nullptr) { + // Head node was not created - create it + node_ptr new_head = create_head_node(); + if (my_head_ptr.compare_exchange_strong(current_head, new_head)) { + current_head = new_head; + } else { + // If an other thread has already created the head node - destroy new_head + // current_head now points to the actual head node + delete_node(new_head); + } + } + __TBB_ASSERT(my_head_ptr.load(std::memory_order_relaxed) != nullptr, nullptr); + __TBB_ASSERT(current_head != nullptr, nullptr); + return current_head; + } + + static iterator get_iterator( const_iterator it ) { + return iterator(it.my_node_ptr); + } + + void internal_move_assign( concurrent_skip_list&& other, /*POCMA || is_always_equal =*/std::true_type ) { + internal_move(std::move(other)); + } + + void internal_move_assign( concurrent_skip_list&& other, /*POCMA || is_always_equal =*/std::false_type ) { + if (my_node_allocator == other.my_node_allocator) { + internal_move(std::move(other)); + } else { + internal_copy(std::make_move_iterator(other.begin()), std::make_move_iterator(other.end())); + } + } + + void internal_swap_fields( concurrent_skip_list& other ) { + using std::swap; + swap_allocators(my_node_allocator, other.my_node_allocator); + swap(my_compare, other.my_compare); + swap(my_rng, other.my_rng); + + swap_atomics_relaxed(my_head_ptr, other.my_head_ptr); + swap_atomics_relaxed(my_size, other.my_size); + swap_atomics_relaxed(my_max_height, other.my_max_height); + } + + void internal_swap( concurrent_skip_list& other, /*POCMA || is_always_equal =*/std::true_type ) { + internal_swap_fields(other); + } + + void internal_swap( concurrent_skip_list& other, /*POCMA || is_always_equal =*/std::false_type ) { + __TBB_ASSERT(my_node_allocator == other.my_node_allocator, "Swapping with unequal allocators is not allowed"); + internal_swap_fields(other); + } + + node_allocator_type my_node_allocator; + key_compare my_compare; + random_level_generator_type my_rng; + std::atomic my_head_ptr; + std::atomic my_size; + std::atomic my_max_height; + + template + friend class concurrent_skip_list; +}; // class concurrent_skip_list + +template +bool operator==( const concurrent_skip_list& lhs, const concurrent_skip_list& rhs ) { + if (lhs.size() != rhs.size()) return false; +#if _MSC_VER + // Passing "unchecked" iterators to std::equal with 3 parameters + // causes compiler warnings. 
+ // The workaround is to use overload with 4 parameters, which is + // available since C++14 - minimally supported version on MSVC + return std::equal(lhs.begin(), lhs.end(), rhs.begin(), rhs.end()); +#else + return std::equal(lhs.begin(), lhs.end(), rhs.begin()); +#endif +} + +#if !__TBB_CPP20_COMPARISONS_PRESENT +template +bool operator!=( const concurrent_skip_list& lhs, const concurrent_skip_list& rhs ) { + return !(lhs == rhs); +} +#endif + +#if __TBB_CPP20_COMPARISONS_PRESENT && __TBB_CPP20_CONCEPTS_PRESENT +template +tbb::detail::synthesized_three_way_result +operator<=>( const concurrent_skip_list& lhs, const concurrent_skip_list& rhs ) { + return std::lexicographical_compare_three_way(lhs.begin(), lhs.end(), + rhs.begin(), rhs.end(), + tbb::detail::synthesized_three_way_comparator{}); +} +#else +template +bool operator<( const concurrent_skip_list& lhs, const concurrent_skip_list& rhs ) { + return std::lexicographical_compare(lhs.begin(), lhs.end(), rhs.begin(), rhs.end()); +} + +template +bool operator>( const concurrent_skip_list& lhs, const concurrent_skip_list& rhs ) { + return rhs < lhs; +} + +template +bool operator<=( const concurrent_skip_list& lhs, const concurrent_skip_list& rhs ) { + return !(rhs < lhs); +} + +template +bool operator>=( const concurrent_skip_list& lhs, const concurrent_skip_list& rhs ) { + return !(lhs < rhs); +} +#endif // __TBB_CPP20_COMPARISONS_PRESENT && __TBB_CPP20_CONCEPTS_PRESENT + +// Generates a number from the interval [0, MaxLevel). +template +class concurrent_geometric_level_generator { +public: + static constexpr std::size_t max_level = MaxLevel; + // TODO: modify the algorithm to accept other values of max_level + static_assert(max_level == 32, "Incompatible max_level for rng"); + + concurrent_geometric_level_generator() : engines(std::minstd_rand::result_type(time(nullptr))) {} + + std::size_t operator()() { + // +1 is required to pass at least 1 into log2 (log2(0) is undefined) + // -1 is required to have an ability to return 0 from the generator (max_level - log2(2^31) - 1) + std::size_t result = max_level - std::size_t(tbb::detail::log2(engines.local()() + 1)) - 1; + __TBB_ASSERT(result <= max_level, nullptr); + return result; + } + +private: + tbb::enumerable_thread_specific engines; +}; + +} // namespace d2 + +} // namespace detail +} // namespace tbb + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma warning(pop) // warning 4127 is back +#endif + +#endif // __TBB_detail__concurrent_skip_list_H diff --git a/third_party/tbb/detail/_concurrent_unordered_base.h b/third_party/tbb/detail/_concurrent_unordered_base.h new file mode 100644 index 000000000..9dd0ad499 --- /dev/null +++ b/third_party/tbb/detail/_concurrent_unordered_base.h @@ -0,0 +1,1515 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_detail__concurrent_unordered_base_H +#define __TBB_detail__concurrent_unordered_base_H + +#if !defined(__TBB_concurrent_unordered_map_H) && !defined(__TBB_concurrent_unordered_set_H) +#error Do not #include this internal file directly; use public TBB headers instead. +#endif + +#include "third_party/tbb/detail/_range_common.h" +#include "third_party/tbb/detail/_containers_helpers.h" +#include "third_party/tbb/detail/_segment_table.h" +#include "third_party/tbb/detail/_hash_compare.h" +#include "third_party/tbb/detail/_allocator_traits.h" +#include "third_party/tbb/detail/_node_handle.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/libcxx/iterator" +#include "third_party/libcxx/utility" +#include "third_party/libcxx/functional" +#include "third_party/libcxx/initializer_list" +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/type_traits" +#include "third_party/libcxx/memory" +#include "third_party/libcxx/algorithm" + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma warning(push) +#pragma warning(disable: 4127) // warning C4127: conditional expression is constant +#endif + +namespace tbb { +namespace detail { +namespace d1 { + +template +class concurrent_unordered_base; + +template +class solist_iterator { +private: + using node_ptr = typename Container::value_node_ptr; + template + friend class split_ordered_list; + template + friend class solist_iterator; + template + friend class concurrent_unordered_base; + template + friend bool operator==( const solist_iterator& i, const solist_iterator& j ); + template + friend bool operator!=( const solist_iterator& i, const solist_iterator& j ); +public: + using value_type = Value; + using difference_type = typename Container::difference_type; + using pointer = value_type*; + using reference = value_type&; + using iterator_category = std::forward_iterator_tag; + + solist_iterator() : my_node_ptr(nullptr) {} + solist_iterator( const solist_iterator& other ) + : my_node_ptr(other.my_node_ptr) {} + + solist_iterator& operator=( const solist_iterator& other ) { + my_node_ptr = other.my_node_ptr; + return *this; + } + + reference operator*() const { + return my_node_ptr->value(); + } + + pointer operator->() const { + return my_node_ptr->storage(); + } + + solist_iterator& operator++() { + auto next_node = my_node_ptr->next(); + while(next_node && next_node->is_dummy()) { + next_node = next_node->next(); + } + my_node_ptr = static_cast(next_node); + return *this; + } + + solist_iterator operator++(int) { + solist_iterator tmp = *this; + ++*this; + return tmp; + } + +private: + solist_iterator( node_ptr pnode ) : my_node_ptr(pnode) {} + + node_ptr get_node_ptr() const { return my_node_ptr; } + + node_ptr my_node_ptr; +}; + +template +bool operator==( const solist_iterator& i, const solist_iterator& j ) { + return i.my_node_ptr == j.my_node_ptr; +} + +template +bool operator!=( const solist_iterator& i, const solist_iterator& j ) { + return i.my_node_ptr != j.my_node_ptr; +} + +template +class list_node { +public: + using node_ptr = list_node*; + using sokey_type = SokeyType; + + list_node(sokey_type key) : my_next(nullptr), my_order_key(key) {} + + void init( sokey_type key ) { + my_order_key = key; + } + + sokey_type order_key() const { + return my_order_key; + } + + bool is_dummy() { + // The last bit of order key is unset for dummy nodes + return (my_order_key & 0x1) == 0; + } + + node_ptr 
next() const { + return my_next.load(std::memory_order_acquire); + } + + void set_next( node_ptr next_node ) { + my_next.store(next_node, std::memory_order_release); + } + + bool try_set_next( node_ptr expected_next, node_ptr new_next ) { + return my_next.compare_exchange_strong(expected_next, new_next); + } + +private: + std::atomic my_next; + sokey_type my_order_key; +}; // class list_node + +template +class value_node : public list_node +{ +public: + using base_type = list_node; + using sokey_type = typename base_type::sokey_type; + using value_type = ValueType; + + value_node( sokey_type ord_key ) : base_type(ord_key) {} + ~value_node() {} + value_type* storage() { + return reinterpret_cast(&my_value); + } + + value_type& value() { + return *storage(); + } + +private: + using aligned_storage_type = typename std::aligned_storage::type; + aligned_storage_type my_value; +}; // class value_node + +template +class concurrent_unordered_base { + using self_type = concurrent_unordered_base; + using traits_type = Traits; + using hash_compare_type = typename traits_type::hash_compare_type; + class unordered_segment_table; +public: + using value_type = typename traits_type::value_type; + using key_type = typename traits_type::key_type; + using allocator_type = typename traits_type::allocator_type; + +private: + using allocator_traits_type = tbb::detail::allocator_traits; + // TODO: check assert conditions for different C++ standards + static_assert(std::is_same::value, + "value_type of the container must be the same as its allocator"); + using sokey_type = std::size_t; + +public: + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + + using iterator = solist_iterator; + using const_iterator = solist_iterator; + using local_iterator = iterator; + using const_local_iterator = const_iterator; + + using reference = value_type&; + using const_reference = const value_type&; + using pointer = typename allocator_traits_type::pointer; + using const_pointer = typename allocator_traits_type::const_pointer; + + using hasher = typename hash_compare_type::hasher; + using key_equal = typename hash_compare_type::key_equal; + +private: + using list_node_type = list_node; + using value_node_type = value_node; + using node_ptr = list_node_type*; + using value_node_ptr = value_node_type*; + + using value_node_allocator_type = typename allocator_traits_type::template rebind_alloc; + using node_allocator_type = typename allocator_traits_type::template rebind_alloc; + + using node_allocator_traits = tbb::detail::allocator_traits; + using value_node_allocator_traits = tbb::detail::allocator_traits; + + static constexpr size_type round_up_to_power_of_two( size_type bucket_count ) { + return size_type(1) << size_type(tbb::detail::log2(uintptr_t(bucket_count == 0 ? 
1 : bucket_count) * 2 - 1)); + } + + template + using is_transparent = dependent_bool, T>; +public: + using node_type = node_handle; + + explicit concurrent_unordered_base( size_type bucket_count, const hasher& hash = hasher(), + const key_equal& equal = key_equal(), const allocator_type& alloc = allocator_type() ) + : my_size(0), + my_bucket_count(round_up_to_power_of_two(bucket_count)), + my_max_load_factor(float(initial_max_load_factor)), + my_hash_compare(hash, equal), + my_head(sokey_type(0)), + my_segments(alloc) {} + + concurrent_unordered_base() : concurrent_unordered_base(initial_bucket_count) {} + + concurrent_unordered_base( size_type bucket_count, const allocator_type& alloc ) + : concurrent_unordered_base(bucket_count, hasher(), key_equal(), alloc) {} + + concurrent_unordered_base( size_type bucket_count, const hasher& hash, const allocator_type& alloc ) + : concurrent_unordered_base(bucket_count, hash, key_equal(), alloc) {} + + explicit concurrent_unordered_base( const allocator_type& alloc ) + : concurrent_unordered_base(initial_bucket_count, hasher(), key_equal(), alloc) {} + + template + concurrent_unordered_base( InputIterator first, InputIterator last, + size_type bucket_count = initial_bucket_count, const hasher& hash = hasher(), + const key_equal& equal = key_equal(), const allocator_type& alloc = allocator_type() ) + : concurrent_unordered_base(bucket_count, hash, equal, alloc) + { + insert(first, last); + } + + template + concurrent_unordered_base( InputIterator first, InputIterator last, + size_type bucket_count, const allocator_type& alloc ) + : concurrent_unordered_base(first, last, bucket_count, hasher(), key_equal(), alloc) {} + + template + concurrent_unordered_base( InputIterator first, InputIterator last, + size_type bucket_count, const hasher& hash, const allocator_type& alloc ) + : concurrent_unordered_base(first, last, bucket_count, hash, key_equal(), alloc) {} + + concurrent_unordered_base( const concurrent_unordered_base& other ) + : my_size(other.my_size.load(std::memory_order_relaxed)), + my_bucket_count(other.my_bucket_count.load(std::memory_order_relaxed)), + my_max_load_factor(other.my_max_load_factor), + my_hash_compare(other.my_hash_compare), + my_head(other.my_head.order_key()), + my_segments(other.my_segments) + { + try_call( [&] { + internal_copy(other); + } ).on_exception( [&] { + clear(); + }); + } + + concurrent_unordered_base( const concurrent_unordered_base& other, const allocator_type& alloc ) + : my_size(other.my_size.load(std::memory_order_relaxed)), + my_bucket_count(other.my_bucket_count.load(std::memory_order_relaxed)), + my_max_load_factor(other.my_max_load_factor), + my_hash_compare(other.my_hash_compare), + my_head(other.my_head.order_key()), + my_segments(other.my_segments, alloc) + { + try_call( [&] { + internal_copy(other); + } ).on_exception( [&] { + clear(); + }); + } + + concurrent_unordered_base( concurrent_unordered_base&& other ) + : my_size(other.my_size.load(std::memory_order_relaxed)), + my_bucket_count(other.my_bucket_count.load(std::memory_order_relaxed)), + my_max_load_factor(std::move(other.my_max_load_factor)), + my_hash_compare(std::move(other.my_hash_compare)), + my_head(other.my_head.order_key()), + my_segments(std::move(other.my_segments)) + { + move_content(std::move(other)); + } + + concurrent_unordered_base( concurrent_unordered_base&& other, const allocator_type& alloc ) + : my_size(other.my_size.load(std::memory_order_relaxed)), + my_bucket_count(other.my_bucket_count.load(std::memory_order_relaxed)), + 
my_max_load_factor(std::move(other.my_max_load_factor)), + my_hash_compare(std::move(other.my_hash_compare)), + my_head(other.my_head.order_key()), + my_segments(std::move(other.my_segments), alloc) + { + using is_always_equal = typename allocator_traits_type::is_always_equal; + internal_move_construct_with_allocator(std::move(other), alloc, is_always_equal()); + } + + concurrent_unordered_base( std::initializer_list init, + size_type bucket_count = initial_bucket_count, + const hasher& hash = hasher(), const key_equal& equal = key_equal(), + const allocator_type& alloc = allocator_type() ) + : concurrent_unordered_base(init.begin(), init.end(), bucket_count, hash, equal, alloc) {} + + concurrent_unordered_base( std::initializer_list init, + size_type bucket_count, const allocator_type& alloc ) + : concurrent_unordered_base(init, bucket_count, hasher(), key_equal(), alloc) {} + + concurrent_unordered_base( std::initializer_list init, + size_type bucket_count, const hasher& hash, const allocator_type& alloc ) + : concurrent_unordered_base(init, bucket_count, hash, key_equal(), alloc) {} + + ~concurrent_unordered_base() { + internal_clear(); + } + + concurrent_unordered_base& operator=( const concurrent_unordered_base& other ) { + if (this != &other) { + clear(); + my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); + my_bucket_count.store(other.my_bucket_count.load(std::memory_order_relaxed), std::memory_order_relaxed); + my_max_load_factor = other.my_max_load_factor; + my_hash_compare = other.my_hash_compare; + my_segments = other.my_segments; + internal_copy(other); // TODO: guards for exceptions? + } + return *this; + } + + concurrent_unordered_base& operator=( concurrent_unordered_base&& other ) noexcept(unordered_segment_table::is_noexcept_assignment) { + if (this != &other) { + clear(); + my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); + my_bucket_count.store(other.my_bucket_count.load(std::memory_order_relaxed), std::memory_order_relaxed); + my_max_load_factor = std::move(other.my_max_load_factor); + my_hash_compare = std::move(other.my_hash_compare); + my_segments = std::move(other.my_segments); + + using pocma_type = typename allocator_traits_type::propagate_on_container_move_assignment; + using is_always_equal = typename allocator_traits_type::is_always_equal; + internal_move_assign(std::move(other), tbb::detail::disjunction()); + } + return *this; + } + + concurrent_unordered_base& operator=( std::initializer_list init ) { + clear(); + insert(init); + return *this; + } + + void swap( concurrent_unordered_base& other ) noexcept(unordered_segment_table::is_noexcept_swap) { + if (this != &other) { + using pocs_type = typename allocator_traits_type::propagate_on_container_swap; + using is_always_equal = typename allocator_traits_type::is_always_equal; + internal_swap(other, tbb::detail::disjunction()); + } + } + + allocator_type get_allocator() const noexcept { return my_segments.get_allocator(); } + + iterator begin() noexcept { return iterator(first_value_node(&my_head)); } + const_iterator begin() const noexcept { return const_iterator(first_value_node(const_cast(&my_head))); } + const_iterator cbegin() const noexcept { return const_iterator(first_value_node(const_cast(&my_head))); } + + iterator end() noexcept { return iterator(nullptr); } + const_iterator end() const noexcept { return const_iterator(nullptr); } + const_iterator cend() const noexcept { return const_iterator(nullptr); } + + 
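+    // Illustrative usage sketch (editorial addition, not upstream TBB code).
+    // concurrent_unordered_base is the shared engine behind
+    // tbb::concurrent_unordered_map / tbb::concurrent_unordered_set; assuming the
+    // vendored header third_party/tbb/concurrent_unordered_map.h, concurrent
+    // insert() and find() are safe, while the unsafe_-prefixed members are not:
+    //
+    //   #include "third_party/tbb/concurrent_unordered_map.h"
+    //   #include <thread>
+    //   #include <vector>
+    //   #include <cstdio>
+    //
+    //   int main() {
+    //       tbb::concurrent_unordered_map<int, int> table;
+    //       std::vector<std::thread> workers;
+    //       for (int t = 0; t < 4; ++t) {
+    //           workers.emplace_back([&table, t] {
+    //               for (int i = 0; i < 1000; ++i)
+    //                   table.insert({t * 1000 + i, i});   // thread-safe insertion
+    //           });
+    //       }
+    //       for (auto& w : workers) w.join();
+    //       auto it = table.find(2042);                    // thread-safe lookup
+    //       if (it != table.end())
+    //           std::printf("size=%zu value=%d\n", table.size(), it->second);
+    //       table.unsafe_erase(2042);   // "unsafe_": must not race with other operations
+    //       return 0;
+    //   }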
__TBB_nodiscard bool empty() const noexcept { return size() == 0; } + size_type size() const noexcept { return my_size.load(std::memory_order_relaxed); } + size_type max_size() const noexcept { return allocator_traits_type::max_size(get_allocator()); } + + void clear() noexcept { + internal_clear(); + } + + std::pair insert( const value_type& value ) { + return internal_insert_value(value); + } + + std::pair insert( value_type&& value ) { + return internal_insert_value(std::move(value)); + } + + iterator insert( const_iterator, const value_type& value ) { + // Ignore hint + return insert(value).first; + } + + iterator insert( const_iterator, value_type&& value ) { + // Ignore hint + return insert(std::move(value)).first; + } + + template + void insert( InputIterator first, InputIterator last ) { + for (; first != last; ++first) { + insert(*first); + } + } + + void insert( std::initializer_list init ) { + insert(init.begin(), init.end()); + } + + std::pair insert( node_type&& nh ) { + if (!nh.empty()) { + value_node_ptr insert_node = node_handle_accessor::get_node_ptr(nh); + auto init_node = [&insert_node]( sokey_type order_key )->value_node_ptr { + insert_node->init(order_key); + return insert_node; + }; + auto insert_result = internal_insert(insert_node->value(), init_node); + if (insert_result.inserted) { + // If the insertion succeeded - set node handle to the empty state + __TBB_ASSERT(insert_result.remaining_node == nullptr, + "internal_insert_node should not return the remaining node if the insertion succeeded"); + node_handle_accessor::deactivate(nh); + } + return { iterator(insert_result.node_with_equal_key), insert_result.inserted }; + } + return {end(), false}; + } + + iterator insert( const_iterator, node_type&& nh ) { + // Ignore hint + return insert(std::move(nh)).first; + } + + template + std::pair emplace( Args&&... args ) { + // Create a node with temporary order_key 0, which will be reinitialize + // in internal_insert after the hash calculation + value_node_ptr insert_node = create_node(0, std::forward(args)...); + + auto init_node = [&insert_node]( sokey_type order_key )->value_node_ptr { + insert_node->init(order_key); + return insert_node; + }; + + auto insert_result = internal_insert(insert_node->value(), init_node); + + if (!insert_result.inserted) { + // If the insertion failed - destroy the node which was created + insert_node->init(split_order_key_regular(1)); + destroy_node(insert_node); + } + + return { iterator(insert_result.node_with_equal_key), insert_result.inserted }; + } + + template + iterator emplace_hint( const_iterator, Args&&... 
args ) { + // Ignore hint + return emplace(std::forward(args)...).first; + } + + iterator unsafe_erase( const_iterator pos ) { + return iterator(first_value_node(internal_erase(pos.get_node_ptr()))); + } + + iterator unsafe_erase( iterator pos ) { + return iterator(first_value_node(internal_erase(pos.get_node_ptr()))); + } + + iterator unsafe_erase( const_iterator first, const_iterator last ) { + while(first != last) { + first = unsafe_erase(first); + } + return iterator(first.get_node_ptr()); + } + + size_type unsafe_erase( const key_type& key ) { + return internal_erase_by_key(key); + } + + template + typename std::enable_if::value + && !std::is_convertible::value + && !std::is_convertible::value, + size_type>::type unsafe_erase( const K& key ) + { + return internal_erase_by_key(key); + } + + node_type unsafe_extract( const_iterator pos ) { + internal_extract(pos.get_node_ptr()); + return node_handle_accessor::construct(pos.get_node_ptr()); + } + + node_type unsafe_extract( iterator pos ) { + internal_extract(pos.get_node_ptr()); + return node_handle_accessor::construct(pos.get_node_ptr()); + } + + node_type unsafe_extract( const key_type& key ) { + iterator item = find(key); + return item == end() ? node_type() : unsafe_extract(item); + } + + template + typename std::enable_if::value + && !std::is_convertible::value + && !std::is_convertible::value, + node_type>::type unsafe_extract( const K& key ) + { + iterator item = find(key); + return item == end() ? node_type() : unsafe_extract(item); + } + + // Lookup functions + iterator find( const key_type& key ) { + value_node_ptr result = internal_find(key); + return result == nullptr ? end() : iterator(result); + } + + const_iterator find( const key_type& key ) const { + value_node_ptr result = const_cast(this)->internal_find(key); + return result == nullptr ? end() : const_iterator(result); + } + + template + typename std::enable_if::value, iterator>::type find( const K& key ) { + value_node_ptr result = internal_find(key); + return result == nullptr ? end() : iterator(result); + } + + template + typename std::enable_if::value, const_iterator>::type find( const K& key ) const { + value_node_ptr result = const_cast(this)->internal_find(key); + return result == nullptr ? 
end() : const_iterator(result); + } + + std::pair equal_range( const key_type& key ) { + auto result = internal_equal_range(key); + return std::make_pair(iterator(result.first), iterator(result.second)); + } + + std::pair equal_range( const key_type& key ) const { + auto result = const_cast(this)->internal_equal_range(key); + return std::make_pair(const_iterator(result.first), const_iterator(result.second)); + } + + template + typename std::enable_if::value, std::pair>::type equal_range( const K& key ) { + auto result = internal_equal_range(key); + return std::make_pair(iterator(result.first), iterator(result.second)); + } + + template + typename std::enable_if::value, std::pair>::type equal_range( const K& key ) const { + auto result = const_cast(this)->internal_equal_range(key); + return std::make_pair(iterator(result.first), iterator(result.second)); + } + + size_type count( const key_type& key ) const { + return internal_count(key); + } + + template + typename std::enable_if::value, size_type>::type count( const K& key ) const { + return internal_count(key); + } + + bool contains( const key_type& key ) const { + return find(key) != end(); + } + + template + typename std::enable_if::value, bool>::type contains( const K& key ) const { + return find(key) != end(); + } + + // Bucket interface + local_iterator unsafe_begin( size_type n ) { + return local_iterator(first_value_node(get_bucket(n))); + } + + const_local_iterator unsafe_begin( size_type n ) const { + auto bucket_begin = first_value_node(const_cast(this)->get_bucket(n)); + return const_local_iterator(bucket_begin); + } + + const_local_iterator unsafe_cbegin( size_type n ) const { + auto bucket_begin = first_value_node(const_cast(this)->get_bucket(n)); + return const_local_iterator(bucket_begin); + } + + local_iterator unsafe_end( size_type n ) { + size_type bucket_count = my_bucket_count.load(std::memory_order_relaxed); + return n != bucket_count - 1 ? unsafe_begin(get_next_bucket_index(n)) : local_iterator(nullptr); + } + + const_local_iterator unsafe_end( size_type n ) const { + size_type bucket_count = my_bucket_count.load(std::memory_order_relaxed); + return n != bucket_count - 1 ? unsafe_begin(get_next_bucket_index(n)) : const_local_iterator(nullptr); + } + + const_local_iterator unsafe_cend( size_type n ) const { + size_type bucket_count = my_bucket_count.load(std::memory_order_relaxed); + return n != bucket_count - 1 ? unsafe_begin(get_next_bucket_index(n)) : const_local_iterator(nullptr); + } + + size_type unsafe_bucket_count() const { return my_bucket_count.load(std::memory_order_relaxed); } + + size_type unsafe_max_bucket_count() const { + return max_size(); + } + + size_type unsafe_bucket_size( size_type n ) const { + return size_type(std::distance(unsafe_begin(n), unsafe_end(n))); + } + + size_type unsafe_bucket( const key_type& key ) const { + return my_hash_compare(key) % my_bucket_count.load(std::memory_order_relaxed); + } + + // Hash policy + float load_factor() const { + return float(size() / float(my_bucket_count.load(std::memory_order_acquire))); + } + + float max_load_factor() const { return my_max_load_factor; } + + void max_load_factor( float mlf ) { + if (mlf != mlf || mlf < 0) { + tbb::detail::throw_exception(exception_id::invalid_load_factor); + } + my_max_load_factor = mlf; + } // TODO: unsafe? + + void rehash( size_type bucket_count ) { + size_type current_bucket_count = my_bucket_count.load(std::memory_order_acquire); + if (current_bucket_count < bucket_count) { + // TODO: do we need do-while here? 
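+            // Editorial note: only a single compare_exchange_strong is attempted here, so if
+            // another thread grows my_bucket_count concurrently the requested bucket_count may
+            // not be reached; reserve() below handles the same situation with a retry loop.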
+ my_bucket_count.compare_exchange_strong(current_bucket_count, round_up_to_power_of_two(bucket_count)); + } + } + + void reserve( size_type elements_count ) { + size_type current_bucket_count = my_bucket_count.load(std::memory_order_acquire); + size_type necessary_bucket_count = current_bucket_count; + + // max_load_factor() is currently unsafe, so we can assume that my_max_load_factor + // would not be changed during the calculation + // TODO: Log2 seems useful here + while (necessary_bucket_count * max_load_factor() < elements_count) { + necessary_bucket_count <<= 1; + } + + while (!my_bucket_count.compare_exchange_strong(current_bucket_count, necessary_bucket_count)) { + if (current_bucket_count >= necessary_bucket_count) + break; + } + } + + // Observers + hasher hash_function() const { return my_hash_compare.hash_function(); } + key_equal key_eq() const { return my_hash_compare.key_eq(); } + + class const_range_type { + private: + const concurrent_unordered_base& my_instance; + node_ptr my_begin_node; // may be node* const + node_ptr my_end_node; + mutable node_ptr my_midpoint_node; + public: + using size_type = typename concurrent_unordered_base::size_type; + using value_type = typename concurrent_unordered_base::value_type; + using reference = typename concurrent_unordered_base::reference; + using difference_type = typename concurrent_unordered_base::difference_type; + using iterator = typename concurrent_unordered_base::const_iterator; + + bool empty() const { return my_begin_node == my_end_node; } + + bool is_divisible() const { + return my_midpoint_node != my_end_node; + } + + size_type grainsize() const { return 1; } + + const_range_type( const_range_type& range, split ) + : my_instance(range.my_instance), + my_begin_node(range.my_midpoint_node), + my_end_node(range.my_end_node) + { + range.my_end_node = my_begin_node; + __TBB_ASSERT(!empty(), "Splitting despite the range is not divisible"); + __TBB_ASSERT(!range.empty(), "Splitting despite the range is not divisible"); + set_midpoint(); + range.set_midpoint(); + } + + iterator begin() const { return iterator(my_instance.first_value_node(my_begin_node)); } + iterator end() const { return iterator(my_instance.first_value_node(my_end_node)); } + + const_range_type( const concurrent_unordered_base& table ) + : my_instance(table), my_begin_node(my_instance.first_value_node(const_cast(&table.my_head))), my_end_node(nullptr) + { + set_midpoint(); + } + private: + void set_midpoint() const { + if (empty()) { + my_midpoint_node = my_end_node; + } else { + sokey_type invalid_key = ~sokey_type(0); + sokey_type begin_key = my_begin_node != nullptr ? my_begin_node->order_key() : invalid_key; + sokey_type end_key = my_end_node != nullptr ? 
my_end_node->order_key() : invalid_key; + + size_type mid_bucket = reverse_bits(begin_key + (end_key - begin_key) / 2) % + my_instance.my_bucket_count.load(std::memory_order_relaxed); + while( my_instance.my_segments[mid_bucket].load(std::memory_order_relaxed) == nullptr) { + mid_bucket = my_instance.get_parent(mid_bucket); + } + if (reverse_bits(mid_bucket) > begin_key) { + // Found a dummy node between begin and end + my_midpoint_node = my_instance.first_value_node( + my_instance.my_segments[mid_bucket].load(std::memory_order_relaxed)); + } else { + // Didn't find a dummy node between begin and end + my_midpoint_node = my_end_node; + } + } + } + }; // class const_range_type + + class range_type : public const_range_type { + public: + using iterator = typename concurrent_unordered_base::iterator; + using const_range_type::const_range_type; + + iterator begin() const { return iterator(const_range_type::begin().get_node_ptr()); } + iterator end() const { return iterator(const_range_type::end().get_node_ptr()); } + }; // class range_type + + // Parallel iteration + range_type range() { + return range_type(*this); + } + + const_range_type range() const { + return const_range_type(*this); + } +protected: + static constexpr bool allow_multimapping = traits_type::allow_multimapping; + +private: + static constexpr size_type initial_bucket_count = 8; + static constexpr float initial_max_load_factor = 4; // TODO: consider 1? + static constexpr size_type pointers_per_embedded_table = sizeof(size_type) * 8 - 1; + + class unordered_segment_table + : public segment_table, allocator_type, unordered_segment_table, pointers_per_embedded_table> + { + using self_type = unordered_segment_table; + using atomic_node_ptr = std::atomic; + using base_type = segment_table, allocator_type, unordered_segment_table, pointers_per_embedded_table>; + using segment_type = typename base_type::segment_type; + using base_allocator_type = typename base_type::allocator_type; + + using segment_allocator_type = typename allocator_traits_type::template rebind_alloc; + using segment_allocator_traits = tbb::detail::allocator_traits; + public: + // Segment table for unordered containers should not be extended in the wait- free implementation + static constexpr bool allow_table_extending = false; + static constexpr bool is_noexcept_assignment = std::is_nothrow_move_assignable::value && + std::is_nothrow_move_assignable::value && + segment_allocator_traits::is_always_equal::value; + static constexpr bool is_noexcept_swap = tbb::detail::is_nothrow_swappable::value && + tbb::detail::is_nothrow_swappable::value && + segment_allocator_traits::is_always_equal::value; + + // TODO: using base_type::base_type is not compiling on Windows and Intel Compiler - investigate + unordered_segment_table( const base_allocator_type& alloc = base_allocator_type() ) + : base_type(alloc) {} + + unordered_segment_table( const unordered_segment_table& ) = default; + + unordered_segment_table( const unordered_segment_table& other, const base_allocator_type& alloc ) + : base_type(other, alloc) {} + + unordered_segment_table( unordered_segment_table&& ) = default; + + unordered_segment_table( unordered_segment_table&& other, const base_allocator_type& alloc ) + : base_type(std::move(other), alloc) {} + + unordered_segment_table& operator=( const unordered_segment_table& ) = default; + + unordered_segment_table& operator=( unordered_segment_table&& ) = default; + + segment_type create_segment( typename base_type::segment_table_type, typename 
base_type::segment_index_type segment_index, size_type ) { + segment_allocator_type alloc(this->get_allocator()); + size_type seg_size = this->segment_size(segment_index); + segment_type new_segment = segment_allocator_traits::allocate(alloc, seg_size); + for (size_type i = 0; i != seg_size; ++i) { + segment_allocator_traits::construct(alloc, new_segment + i, nullptr); + } + return new_segment; + } + + segment_type nullify_segment( typename base_type::segment_table_type table, size_type segment_index ) { + segment_type target_segment = table[segment_index].load(std::memory_order_relaxed); + table[segment_index].store(nullptr, std::memory_order_relaxed); + return target_segment; + } + + // deallocate_segment is required by the segment_table base class, but + // in unordered, it is also necessary to call the destructor during deallocation + void deallocate_segment( segment_type address, size_type index ) { + destroy_segment(address, index); + } + + void destroy_segment( segment_type address, size_type index ) { + segment_allocator_type alloc(this->get_allocator()); + for (size_type i = 0; i != this->segment_size(index); ++i) { + segment_allocator_traits::destroy(alloc, address + i); + } + segment_allocator_traits::deallocate(alloc, address, this->segment_size(index)); + } + + + void copy_segment( size_type index, segment_type, segment_type to ) { + if (index == 0) { + // The first element in the first segment is embedded into the table (my_head) + // so the first pointer should not be stored here + // It would be stored during move ctor/assignment operation + to[1].store(nullptr, std::memory_order_relaxed); + } else { + for (size_type i = 0; i != this->segment_size(index); ++i) { + to[i].store(nullptr, std::memory_order_relaxed); + } + } + } + + void move_segment( size_type index, segment_type from, segment_type to ) { + if (index == 0) { + // The first element in the first segment is embedded into the table (my_head) + // so the first pointer should not be stored here + // It would be stored during move ctor/assignment operation + to[1].store(from[1].load(std::memory_order_relaxed), std::memory_order_relaxed); + } else { + for (size_type i = 0; i != this->segment_size(index); ++i) { + to[i].store(from[i].load(std::memory_order_relaxed), std::memory_order_relaxed); + from[i].store(nullptr, std::memory_order_relaxed); + } + } + } + + // allocate_long_table is required by the segment_table base class, but unused for unordered containers + typename base_type::segment_table_type allocate_long_table( const typename base_type::atomic_segment*, size_type ) { + __TBB_ASSERT(false, "This method should never been called"); + // TableType is a pointer + return nullptr; + } + + // destroy_elements is required by the segment_table base class, but unused for unordered containers + // this function call but do nothing + void destroy_elements() {} + }; // struct unordered_segment_table + + void internal_clear() { + // TODO: consider usefulness of two versions of clear() - with dummy nodes deallocation and without it + node_ptr next = my_head.next(); + node_ptr curr = next; + + my_head.set_next(nullptr); + + while (curr != nullptr) { + next = curr->next(); + destroy_node(curr); + curr = next; + } + + my_size.store(0, std::memory_order_relaxed); + my_segments.clear(); + } + + void destroy_node( node_ptr node ) { + if (node->is_dummy()) { + node_allocator_type dummy_node_allocator(my_segments.get_allocator()); + // Destroy the node + node_allocator_traits::destroy(dummy_node_allocator, node); + // Deallocate 
the memory + node_allocator_traits::deallocate(dummy_node_allocator, node, 1); + } else { + // GCC 11.1 issues a warning here that incorrect destructor might be called for dummy_nodes + #if (__TBB_GCC_VERSION >= 110100 && __TBB_GCC_VERSION < 130000 ) && !__clang__ && !__INTEL_COMPILER + volatile + #endif + value_node_ptr val_node = static_cast(node); + value_node_allocator_type value_node_allocator(my_segments.get_allocator()); + // Destroy the value + value_node_allocator_traits::destroy(value_node_allocator, val_node->storage()); + // Destroy the node + value_node_allocator_traits::destroy(value_node_allocator, val_node); + // Deallocate the memory + value_node_allocator_traits::deallocate(value_node_allocator, val_node, 1); + } + } + + struct internal_insert_return_type { + // If the insertion failed - the remaining_node points to the node, which was failed to insert + // This node can be allocated in process of insertion + value_node_ptr remaining_node; + // If the insertion failed - node_with_equal_key points to the node in the list with the + // key, equivalent to the inserted, otherwise it points to the node, which was inserted. + value_node_ptr node_with_equal_key; + // Insertion status + // NOTE: if it is true - remaining_node should be nullptr + bool inserted; + }; // struct internal_insert_return_type + + // Inserts the value into the split ordered list + template + std::pair internal_insert_value( ValueType&& value ) { + + auto create_value_node = [&value, this]( sokey_type order_key )->value_node_ptr { + return create_node(order_key, std::forward(value)); + }; + + auto insert_result = internal_insert(value, create_value_node); + + if (insert_result.remaining_node != nullptr) { + // If the insertion fails - destroy the node which was failed to insert if it exist + __TBB_ASSERT(!insert_result.inserted, + "remaining_node should be nullptr if the node was successfully inserted"); + destroy_node(insert_result.remaining_node); + } + + return { iterator(insert_result.node_with_equal_key), insert_result.inserted }; + } + + // Inserts the node into the split ordered list + // Creates a node using the specified callback after the place for insertion was found + // Returns internal_insert_return_type object, where: + // - If the insertion succeeded: + // - remaining_node is nullptr + // - node_with_equal_key point to the inserted node + // - inserted is true + // - If the insertion failed: + // - remaining_node points to the node, that was failed to insert if it was created. 
+ // nullptr if the node was not created, because the requested key was already + // presented in the list + // - node_with_equal_key point to the element in the list with the key, equivalent to + // to the requested key + // - inserted is false + template + internal_insert_return_type internal_insert( ValueType&& value, CreateInsertNode create_insert_node ) { + static_assert(std::is_same::type, value_type>::value, + "Incorrect type in internal_insert"); + const key_type& key = traits_type::get_key(value); + sokey_type hash_key = sokey_type(my_hash_compare(key)); + + sokey_type order_key = split_order_key_regular(hash_key); + node_ptr prev = prepare_bucket(hash_key); + __TBB_ASSERT(prev != nullptr, "Invalid head node"); + + auto search_result = search_after(prev, order_key, key); + + if (search_result.second) { + return internal_insert_return_type{ nullptr, search_result.first, false }; + } + + value_node_ptr new_node = create_insert_node(order_key); + node_ptr curr = search_result.first; + + while (!try_insert(prev, new_node, curr)) { + search_result = search_after(prev, order_key, key); + if (search_result.second) { + return internal_insert_return_type{ new_node, search_result.first, false }; + } + curr = search_result.first; + } + + auto sz = my_size.fetch_add(1); + adjust_table_size(sz + 1, my_bucket_count.load(std::memory_order_acquire)); + return internal_insert_return_type{ nullptr, static_cast(new_node), true }; + } + + // Searches the node with the key, equivalent to key with requested order key after the node prev + // Returns the existing node and true if the node is already in the list + // Returns the first node with the order key, greater than requested and false if the node is not presented in the list + std::pair search_after( node_ptr& prev, sokey_type order_key, const key_type& key ) { + // NOTE: static_cast(curr) should be done only after we would ensure + // that the node is not a dummy node + + node_ptr curr = prev->next(); + + while (curr != nullptr && (curr->order_key() < order_key || + (curr->order_key() == order_key && !my_hash_compare(traits_type::get_key(static_cast(curr)->value()), key)))) + { + prev = curr; + curr = curr->next(); + } + + if (curr != nullptr && curr->order_key() == order_key && !allow_multimapping) { + return { static_cast(curr), true }; + } + return { static_cast(curr), false }; + } + + void adjust_table_size( size_type total_elements, size_type current_size ) { + // Grow the table by a factor of 2 if possible and needed + if ( (float(total_elements) / float(current_size)) > my_max_load_factor ) { + // Double the size of the hash only if size hash not changed in between loads + my_bucket_count.compare_exchange_strong(current_size, 2u * current_size); + } + } + + node_ptr insert_dummy_node( node_ptr parent_dummy_node, sokey_type order_key ) { + node_ptr prev_node = parent_dummy_node; + + node_ptr dummy_node = create_dummy_node(order_key); + node_ptr next_node; + + do { + next_node = prev_node->next(); + // Move forward through the list while the order key is less than requested + while (next_node != nullptr && next_node->order_key() < order_key) { + prev_node = next_node; + next_node = next_node->next(); + } + + if (next_node != nullptr && next_node->order_key() == order_key) { + // Another dummy node with the same order key was inserted by another thread + // Destroy the node and exit + destroy_node(dummy_node); + return next_node; + } + } while (!try_insert(prev_node, dummy_node, next_node)); + + return dummy_node; + } + + // Try to insert a 
node between prev_node and expected next + // If the next is not equal to expected next - return false + static bool try_insert( node_ptr prev_node, node_ptr new_node, node_ptr current_next_node ) { + new_node->set_next(current_next_node); + return prev_node->try_set_next(current_next_node, new_node); + } + + // Returns the bucket, associated with the hash_key + node_ptr prepare_bucket( sokey_type hash_key ) { + size_type bucket = hash_key % my_bucket_count.load(std::memory_order_acquire); + return get_bucket(bucket); + } + + // Initialize the corresponding bucket if it is not initialized + node_ptr get_bucket( size_type bucket_index ) { + if (my_segments[bucket_index].load(std::memory_order_acquire) == nullptr) { + init_bucket(bucket_index); + } + return my_segments[bucket_index].load(std::memory_order_acquire); + } + + void init_bucket( size_type bucket ) { + if (bucket == 0) { + // Atomicaly store the first bucket into my_head + node_ptr disabled = nullptr; + my_segments[0].compare_exchange_strong(disabled, &my_head); + return; + } + + size_type parent_bucket = get_parent(bucket); + + while (my_segments[parent_bucket].load(std::memory_order_acquire) == nullptr) { + // Initialize all of the parent buckets + init_bucket(parent_bucket); + } + + __TBB_ASSERT(my_segments[parent_bucket].load(std::memory_order_acquire) != nullptr, "Parent bucket should be initialized"); + node_ptr parent = my_segments[parent_bucket].load(std::memory_order_acquire); + + // Insert dummy node into the list + node_ptr dummy_node = insert_dummy_node(parent, split_order_key_dummy(bucket)); + // TODO: consider returning pair to avoid store operation if the bucket was stored by an other thread + // or move store to insert_dummy_node + // Add dummy_node into the segment table + my_segments[bucket].store(dummy_node, std::memory_order_release); + } + + node_ptr create_dummy_node( sokey_type order_key ) { + node_allocator_type dummy_node_allocator(my_segments.get_allocator()); + node_ptr dummy_node = node_allocator_traits::allocate(dummy_node_allocator, 1); + node_allocator_traits::construct(dummy_node_allocator, dummy_node, order_key); + return dummy_node; + } + + template + value_node_ptr create_node( sokey_type order_key, Args&&... 
args ) { + value_node_allocator_type value_node_allocator(my_segments.get_allocator()); + // Allocate memory for the value_node + value_node_ptr new_node = value_node_allocator_traits::allocate(value_node_allocator, 1); + // Construct the node + value_node_allocator_traits::construct(value_node_allocator, new_node, order_key); + + // try_call API is not convenient here due to broken + // variadic capture on GCC 4.8.5 + auto value_guard = make_raii_guard([&] { + value_node_allocator_traits::destroy(value_node_allocator, new_node); + value_node_allocator_traits::deallocate(value_node_allocator, new_node, 1); + }); + + // Construct the value in the node + value_node_allocator_traits::construct(value_node_allocator, new_node->storage(), std::forward(args)...); + value_guard.dismiss(); + return new_node; + } + + value_node_ptr first_value_node( node_ptr first_node ) const { + while (first_node != nullptr && first_node->is_dummy()) { + first_node = first_node->next(); + } + return static_cast(first_node); + } + + // Unsafe method, which removes the node from the list and returns the next node + node_ptr internal_erase( value_node_ptr node_to_erase ) { + __TBB_ASSERT(node_to_erase != nullptr, "Invalid iterator for erase"); + node_ptr next_node = node_to_erase->next(); + internal_extract(node_to_erase); + destroy_node(node_to_erase); + return next_node; + } + + template + size_type internal_erase_by_key( const K& key ) { + // TODO: consider reimplementation without equal_range - it is not effective to perform lookup over a bucket + // for each unsafe_erase call + auto eq_range = equal_range(key); + size_type erased_count = 0; + + for (auto it = eq_range.first; it != eq_range.second;) { + it = unsafe_erase(it); + ++erased_count; + } + return erased_count; + } + + // Unsafe method, which extracts the node from the list + void internal_extract( value_node_ptr node_to_extract ) { + const key_type& key = traits_type::get_key(node_to_extract->value()); + sokey_type hash_key = sokey_type(my_hash_compare(key)); + + node_ptr prev_node = prepare_bucket(hash_key); + + for (node_ptr node = prev_node->next(); node != nullptr; prev_node = node, node = node->next()) { + if (node == node_to_extract) { + unlink_node(prev_node, node, node_to_extract->next()); + my_size.store(my_size.load(std::memory_order_relaxed) - 1, std::memory_order_relaxed); + return; + } + __TBB_ASSERT(node->order_key() <= node_to_extract->order_key(), + "node, which is going to be extracted should be presented in the list"); + } + } + +protected: + template + void internal_merge( SourceType&& source ) { + static_assert(std::is_same::type::node_type>::value, + "Incompatible containers cannot be merged"); + + for (node_ptr source_prev = &source.my_head; source_prev->next() != nullptr;) { + if (!source_prev->next()->is_dummy()) { + value_node_ptr curr = static_cast(source_prev->next()); + // If the multimapping is allowed, or the key is not presented + // in the *this container - extract the node from the list + if (allow_multimapping || !contains(traits_type::get_key(curr->value()))) { + node_ptr next_node = curr->next(); + source.unlink_node(source_prev, curr, next_node); + + // Remember the old order key + sokey_type old_order_key = curr->order_key(); + + // Node handle with curr cannot be used directly in insert call, because + // the destructor of node_type will destroy curr + node_type curr_node = node_handle_accessor::construct(curr); + + // If the insertion fails - return ownership of the node to the source + if 
(!insert(std::move(curr_node)).second) { + __TBB_ASSERT(!allow_multimapping, "Insertion should succeed for multicontainer"); + __TBB_ASSERT(source_prev->next() == next_node, + "Concurrent operations with the source container in merge are prohibited"); + + // Initialize the node with the old order key, because the order key + // can change during the insertion + curr->init(old_order_key); + __TBB_ASSERT(old_order_key >= source_prev->order_key() && + (next_node == nullptr || old_order_key <= next_node->order_key()), + "Wrong nodes order in the source container"); + // Merge is unsafe for source container, so the insertion back can be done without compare_exchange + curr->set_next(next_node); + source_prev->set_next(curr); + source_prev = curr; + node_handle_accessor::deactivate(curr_node); + } else { + source.my_size.fetch_sub(1, std::memory_order_relaxed); + } + } else { + source_prev = curr; + } + } else { + source_prev = source_prev->next(); + } + } + } + +private: + // Unsafe method, which unlinks the node between prev and next + void unlink_node( node_ptr prev_node, node_ptr node_to_unlink, node_ptr next_node ) { + __TBB_ASSERT(prev_node->next() == node_to_unlink && + node_to_unlink->next() == next_node, + "erasing and extracting nodes from the containers are unsafe in concurrent mode"); + prev_node->set_next(next_node); + node_to_unlink->set_next(nullptr); + } + + template + value_node_ptr internal_find( const K& key ) { + sokey_type hash_key = sokey_type(my_hash_compare(key)); + sokey_type order_key = split_order_key_regular(hash_key); + + node_ptr curr = prepare_bucket(hash_key); + + while (curr != nullptr) { + if (curr->order_key() > order_key) { + // If the order key is greater than the requested order key, + // the element is not in the hash table + return nullptr; + } else if (curr->order_key() == order_key && + my_hash_compare(traits_type::get_key(static_cast(curr)->value()), key)) { + // The fact that order keys match does not mean that the element is found. + // Key function comparison has to be performed to check whether this is the + // right element. If not, keep searching while order key is the same. + return static_cast(curr); + } + curr = curr->next(); + } + + return nullptr; + } + + template + std::pair internal_equal_range( const K& key ) { + sokey_type hash_key = sokey_type(my_hash_compare(key)); + sokey_type order_key = split_order_key_regular(hash_key); + + node_ptr curr = prepare_bucket(hash_key); + + while (curr != nullptr) { + if (curr->order_key() > order_key) { + // If the order key is greater than the requested order key, + // the element is not in the hash table + return std::make_pair(nullptr, nullptr); + } else if (curr->order_key() == order_key && + my_hash_compare(traits_type::get_key(static_cast(curr)->value()), key)) { + value_node_ptr first = static_cast(curr); + node_ptr last = first; + do { + last = last->next(); + } while (allow_multimapping && last != nullptr && !last->is_dummy() && + my_hash_compare(traits_type::get_key(static_cast(last)->value()), key)); + return std::make_pair(first, first_value_node(last)); + } + curr = curr->next(); + } + return {nullptr, nullptr}; + } + + template + size_type internal_count( const K& key ) const { + if (allow_multimapping) { + // TODO: consider reimplementing the internal_equal_range with elements counting to avoid std::distance + auto eq_range = equal_range(key); + return std::distance(eq_range.first, eq_range.second); + } else { + return contains(key) ? 
1 : 0; + } + } + + void internal_copy( const concurrent_unordered_base& other ) { + node_ptr last_node = &my_head; + my_segments[0].store(&my_head, std::memory_order_relaxed); + + for (node_ptr node = other.my_head.next(); node != nullptr; node = node->next()) { + node_ptr new_node; + if (!node->is_dummy()) { + // The node in the right table contains a value + new_node = create_node(node->order_key(), static_cast(node)->value()); + } else { + // The node in the right table is a dummy node + new_node = create_dummy_node(node->order_key()); + my_segments[reverse_bits(node->order_key())].store(new_node, std::memory_order_relaxed); + } + + last_node->set_next(new_node); + last_node = new_node; + } + } + + void internal_move( concurrent_unordered_base&& other ) { + node_ptr last_node = &my_head; + my_segments[0].store(&my_head, std::memory_order_relaxed); + + for (node_ptr node = other.my_head.next(); node != nullptr; node = node->next()) { + node_ptr new_node; + if (!node->is_dummy()) { + // The node in the right table contains a value + new_node = create_node(node->order_key(), std::move(static_cast(node)->value())); + } else { + // TODO: do we need to destroy a dummy node in the right container? + // The node in the right table is a dummy_node + new_node = create_dummy_node(node->order_key()); + my_segments[reverse_bits(node->order_key())].store(new_node, std::memory_order_relaxed); + } + + last_node->set_next(new_node); + last_node = new_node; + } + } + + void move_content( concurrent_unordered_base&& other ) { + // NOTE: allocators should be equal + my_head.set_next(other.my_head.next()); + other.my_head.set_next(nullptr); + my_segments[0].store(&my_head, std::memory_order_relaxed); + + other.my_bucket_count.store(initial_bucket_count, std::memory_order_relaxed); + other.my_max_load_factor = initial_max_load_factor; + other.my_size.store(0, std::memory_order_relaxed); + } + + void internal_move_construct_with_allocator( concurrent_unordered_base&& other, const allocator_type&, + /*is_always_equal = */std::true_type ) { + // Allocators are always equal - no need to compare for equality + move_content(std::move(other)); + } + + void internal_move_construct_with_allocator( concurrent_unordered_base&& other, const allocator_type& alloc, + /*is_always_equal = */std::false_type ) { + // Allocators are not always equal + if (alloc == other.my_segments.get_allocator()) { + move_content(std::move(other)); + } else { + try_call( [&] { + internal_move(std::move(other)); + } ).on_exception( [&] { + clear(); + }); + } + } + + // Move assigns the hash table to other is any instances of allocator_type are always equal + // or propagate_on_container_move_assignment is true + void internal_move_assign( concurrent_unordered_base&& other, /*is_always_equal || POCMA = */std::true_type ) { + move_content(std::move(other)); + } + + // Move assigns the hash table to other is any instances of allocator_type are not always equal + // and propagate_on_container_move_assignment is false + void internal_move_assign( concurrent_unordered_base&& other, /*is_always_equal || POCMA = */std::false_type ) { + if (my_segments.get_allocator() == other.my_segments.get_allocator()) { + move_content(std::move(other)); + } else { + // TODO: guards for exceptions + internal_move(std::move(other)); + } + } + + void internal_swap( concurrent_unordered_base& other, /*is_always_equal || POCS = */std::true_type ) { + internal_swap_fields(other); + } + + void internal_swap( concurrent_unordered_base& other, /*is_always_equal || POCS = 
*/std::false_type ) { + __TBB_ASSERT(my_segments.get_allocator() == other.my_segments.get_allocator(), + "Swapping with unequal allocators is not allowed"); + internal_swap_fields(other); + } + + void internal_swap_fields( concurrent_unordered_base& other ) { + node_ptr first_node = my_head.next(); + my_head.set_next(other.my_head.next()); + other.my_head.set_next(first_node); + + size_type current_size = my_size.load(std::memory_order_relaxed); + my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.my_size.store(current_size, std::memory_order_relaxed); + + size_type bucket_count = my_bucket_count.load(std::memory_order_relaxed); + my_bucket_count.store(other.my_bucket_count.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.my_bucket_count.store(bucket_count, std::memory_order_relaxed); + + using std::swap; + swap(my_max_load_factor, other.my_max_load_factor); + swap(my_hash_compare, other.my_hash_compare); + my_segments.swap(other.my_segments); + + // swap() method from segment table swaps all of the segments including the first segment + // We should restore it to my_head. Without it the first segment of the container will point + // to other.my_head. + my_segments[0].store(&my_head, std::memory_order_relaxed); + other.my_segments[0].store(&other.my_head, std::memory_order_relaxed); + } + + // A regular order key has its original hash value reversed and the last bit set + static constexpr sokey_type split_order_key_regular( sokey_type hash ) { + return reverse_bits(hash) | 0x1; + } + + // A dummy order key has its original hash value reversed and the last bit unset + static constexpr sokey_type split_order_key_dummy( sokey_type hash ) { + return reverse_bits(hash) & ~sokey_type(0x1); + } + + size_type get_parent( size_type bucket ) const { + // Unset bucket's most significant turned-on bit + __TBB_ASSERT(bucket != 0, "Unable to get_parent of the bucket 0"); + size_type msb = tbb::detail::log2(bucket); + return bucket & ~(size_type(1) << msb); + } + + size_type get_next_bucket_index( size_type bucket ) const { + size_type bits = tbb::detail::log2(my_bucket_count.load(std::memory_order_relaxed)); + size_type reversed_next = reverse_n_bits(bucket, bits) + 1; + return reverse_n_bits(reversed_next, bits); + } + + std::atomic my_size; + std::atomic my_bucket_count; + float my_max_load_factor; + hash_compare_type my_hash_compare; + + list_node_type my_head; // Head node for split ordered list + unordered_segment_table my_segments; // Segment table of pointers to nodes + + template + friend class solist_iterator; + + template + friend class concurrent_unordered_base; +}; // class concurrent_unordered_base + +template +bool operator==( const concurrent_unordered_base& lhs, + const concurrent_unordered_base& rhs ) { + if (&lhs == &rhs) { return true; } + if (lhs.size() != rhs.size()) { return false; } + +#if _MSC_VER + // Passing "unchecked" iterators to std::permutation with 3 parameters + // causes compiler warnings. 
+ // The workaround is to use overload with 4 parameters, which is + // available since C++14 - minimally supported version on MSVC + return std::is_permutation(lhs.begin(), lhs.end(), rhs.begin(), rhs.end()); +#else + return std::is_permutation(lhs.begin(), lhs.end(), rhs.begin()); +#endif +} + +#if !__TBB_CPP20_COMPARISONS_PRESENT +template +bool operator!=( const concurrent_unordered_base& lhs, + const concurrent_unordered_base& rhs ) { + return !(lhs == rhs); +} +#endif + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma warning(pop) // warning 4127 is back +#endif + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail__concurrent_unordered_base_H diff --git a/third_party/tbb/detail/_config.h b/third_party/tbb/detail/_config.h new file mode 100644 index 000000000..ae3383243 --- /dev/null +++ b/third_party/tbb/detail/_config.h @@ -0,0 +1,530 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__config_H +#define __TBB_detail__config_H + +/** This header is supposed to contain macro definitions only. + The macros defined here are intended to control such aspects of TBB build as + - presence of compiler features + - compilation modes + - feature sets + - known compiler/platform issues +**/ + +/* Check which standard library we use. */ +#include "third_party/libcxx/cstddef" + +#ifdef __has_include +#if __has_include() +#include "third_party/libcxx/version" +#endif +#endif + +#include "third_party/tbb/detail/_export.h" + +#if _MSC_VER + #define __TBB_EXPORTED_FUNC __cdecl + #define __TBB_EXPORTED_METHOD __thiscall +#else + #define __TBB_EXPORTED_FUNC + #define __TBB_EXPORTED_METHOD +#endif + +#if defined(_MSVC_LANG) + #define __TBB_LANG _MSVC_LANG +#else + #define __TBB_LANG __cplusplus +#endif // _MSVC_LANG + +#define __TBB_CPP14_PRESENT (__TBB_LANG >= 201402L) +#define __TBB_CPP17_PRESENT (__TBB_LANG >= 201703L) +#define __TBB_CPP20_PRESENT (__TBB_LANG >= 202002L) + +#if __INTEL_COMPILER || _MSC_VER + #define __TBB_NOINLINE(decl) __declspec(noinline) decl +#elif __GNUC__ + #define __TBB_NOINLINE(decl) decl __attribute__ ((noinline)) +#else + #define __TBB_NOINLINE(decl) decl +#endif + +#define __TBB_STRING_AUX(x) #x +#define __TBB_STRING(x) __TBB_STRING_AUX(x) + +// Note that when ICC or Clang is in use, __TBB_GCC_VERSION might not fully match +// the actual GCC version on the system. +#define __TBB_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) + +/* Check which standard library we use. */ + +// Prior to GCC 7, GNU libstdc++ did not have a convenient version macro. +// Therefore we use different ways to detect its version. +#ifdef TBB_USE_GLIBCXX_VERSION + // The version is explicitly specified in our public TBB_USE_GLIBCXX_VERSION macro. + // Its format should match the __TBB_GCC_VERSION above, e.g. 70301 for libstdc++ coming with GCC 7.3.1. 
+ #define __TBB_GLIBCXX_VERSION TBB_USE_GLIBCXX_VERSION +#elif _GLIBCXX_RELEASE && _GLIBCXX_RELEASE != __GNUC__ + // Reported versions of GCC and libstdc++ do not match; trust the latter + #define __TBB_GLIBCXX_VERSION (_GLIBCXX_RELEASE*10000) +#elif __GLIBCPP__ || __GLIBCXX__ + // The version macro is not defined or matches the GCC version; use __TBB_GCC_VERSION + #define __TBB_GLIBCXX_VERSION __TBB_GCC_VERSION +#endif + +#if __clang__ + // according to clang documentation, version can be vendor specific + #define __TBB_CLANG_VERSION (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) +#endif + +/** Macro helpers **/ + +#define __TBB_CONCAT_AUX(A,B) A##B +// The additional level of indirection is needed to expand macros A and B (not to get the AB macro). +// See [cpp.subst] and [cpp.concat] for more details. +#define __TBB_CONCAT(A,B) __TBB_CONCAT_AUX(A,B) +// The IGNORED argument and comma are needed to always have 2 arguments (even when A is empty). +#define __TBB_IS_MACRO_EMPTY(A,IGNORED) __TBB_CONCAT_AUX(__TBB_MACRO_EMPTY,A) +#define __TBB_MACRO_EMPTY 1 + +#if _M_X64 || _M_ARM64 + #define __TBB_W(name) name##64 +#else + #define __TBB_W(name) name +#endif + +/** User controlled TBB features & modes **/ + +#ifndef TBB_USE_DEBUG + /* + There are four cases that are supported: + 1. "_DEBUG is undefined" means "no debug"; + 2. "_DEBUG defined to something that is evaluated to 0" (including "garbage", as per [cpp.cond]) means "no debug"; + 3. "_DEBUG defined to something that is evaluated to a non-zero value" means "debug"; + 4. "_DEBUG defined to nothing (empty)" means "debug". + */ + #ifdef _DEBUG + // Check if _DEBUG is empty. + #define __TBB_IS__DEBUG_EMPTY (__TBB_IS_MACRO_EMPTY(_DEBUG,IGNORED)==__TBB_MACRO_EMPTY) + #if __TBB_IS__DEBUG_EMPTY + #define TBB_USE_DEBUG 1 + #else + #define TBB_USE_DEBUG _DEBUG + #endif // __TBB_IS__DEBUG_EMPTY + #else + #define TBB_USE_DEBUG 0 + #endif // _DEBUG +#endif // TBB_USE_DEBUG + +#ifndef TBB_USE_ASSERT + #define TBB_USE_ASSERT TBB_USE_DEBUG +#endif // TBB_USE_ASSERT + +#ifndef TBB_USE_PROFILING_TOOLS +#if TBB_USE_DEBUG + #define TBB_USE_PROFILING_TOOLS 2 +#else // TBB_USE_DEBUG + #define TBB_USE_PROFILING_TOOLS 0 +#endif // TBB_USE_DEBUG +#endif // TBB_USE_PROFILING_TOOLS + +// Exceptions support cases +#if !(__EXCEPTIONS || defined(_CPPUNWIND) || __SUNPRO_CC) + #if TBB_USE_EXCEPTIONS + #error Compilation settings do not support exception handling. Please do not set TBB_USE_EXCEPTIONS macro or set it to 0. 
+ #elif !defined(TBB_USE_EXCEPTIONS) + #define TBB_USE_EXCEPTIONS 0 + #endif +#elif !defined(TBB_USE_EXCEPTIONS) + #define TBB_USE_EXCEPTIONS 1 +#endif + +/** Preprocessor symbols to determine HW architecture **/ + +#if _WIN32 || _WIN64 + #if defined(_M_X64) || defined(__x86_64__) // the latter for MinGW support + #define __TBB_x86_64 1 + #elif defined(_M_IA64) + #define __TBB_ipf 1 + #elif defined(_M_IX86) || defined(__i386__) // the latter for MinGW support + #define __TBB_x86_32 1 + #else + #define __TBB_generic_arch 1 + #endif +#else /* Assume generic Unix */ + #if __x86_64__ + #define __TBB_x86_64 1 + #elif __ia64__ + #define __TBB_ipf 1 + #elif __i386__||__i386 // __i386 is for Sun OS + #define __TBB_x86_32 1 + #else + #define __TBB_generic_arch 1 + #endif +#endif + +/** Windows API or POSIX API **/ + +#if _WIN32 || _WIN64 + #define __TBB_USE_WINAPI 1 +#else + #define __TBB_USE_POSIX 1 +#endif + +/** Internal TBB features & modes **/ + +/** __TBB_DYNAMIC_LOAD_ENABLED describes the system possibility to load shared libraries at run time **/ +#ifndef __TBB_DYNAMIC_LOAD_ENABLED + #define __TBB_DYNAMIC_LOAD_ENABLED 1 +#endif + +/** __TBB_WIN8UI_SUPPORT enables support of Windows* Store Apps and limit a possibility to load + shared libraries at run time only from application container **/ +#if defined(WINAPI_FAMILY) && WINAPI_FAMILY == WINAPI_FAMILY_APP + #define __TBB_WIN8UI_SUPPORT 1 +#else + #define __TBB_WIN8UI_SUPPORT 0 +#endif + +/** __TBB_WEAK_SYMBOLS_PRESENT denotes that the system supports the weak symbol mechanism **/ +#ifndef __TBB_WEAK_SYMBOLS_PRESENT + #define __TBB_WEAK_SYMBOLS_PRESENT ( !_WIN32 && !__APPLE__ && !__sun && (__TBB_GCC_VERSION >= 40000 || __INTEL_COMPILER ) ) +#endif + +/** Presence of compiler features **/ + +#if __clang__ && !__INTEL_COMPILER + #define __TBB_USE_OPTIONAL_RTTI __has_feature(cxx_rtti) +#elif defined(_CPPRTTI) + #define __TBB_USE_OPTIONAL_RTTI 1 +#else + #define __TBB_USE_OPTIONAL_RTTI (__GXX_RTTI || __RTTI || __INTEL_RTTI__) +#endif + +/** Address sanitizer detection **/ +#ifdef __SANITIZE_ADDRESS__ + #define __TBB_USE_ADDRESS_SANITIZER 1 +#elif defined(__has_feature) +#if __has_feature(address_sanitizer) + #define __TBB_USE_ADDRESS_SANITIZER 1 +#endif +#endif + +/** Library features presence macros **/ + +#define __TBB_CPP14_INTEGER_SEQUENCE_PRESENT (__TBB_LANG >= 201402L) +#define __TBB_CPP17_INVOKE_PRESENT (__TBB_LANG >= 201703L) + +// TODO: Remove the condition(__INTEL_COMPILER > 2021) from the __TBB_CPP17_DEDUCTION_GUIDES_PRESENT +// macro when this feature start working correctly on this compiler. 
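+// Editorial sketch of how the detection macros defined just below are typically
+// consumed (my_wrapper is a hypothetical type used only for illustration, not
+// part of this patch):
+//
+//   template <typename T>
+//   struct my_wrapper { my_wrapper(const T&) {} };
+//
+//   #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+//   template <typename T>
+//   my_wrapper(const T&) -> my_wrapper<T>;   // only declare guides when the compiler supports them
+//   #endif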
+#if __INTEL_COMPILER && (!_MSC_VER || __INTEL_CXX11_MOVE__) + #define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (__TBB_LANG >= 201402L) + #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (__INTEL_COMPILER > 2021 && __TBB_LANG >= 201703L) + #define __TBB_CPP20_CONCEPTS_PRESENT 0 // TODO: add a mechanism for future addition +#elif __clang__ + #define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (__has_feature(cxx_variable_templates)) + #define __TBB_CPP20_CONCEPTS_PRESENT 0 // TODO: add a mechanism for future addition + #ifdef __cpp_deduction_guides + #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (__cpp_deduction_guides >= 201611L) + #else + #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT 0 + #endif +#elif __GNUC__ + #define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (__TBB_LANG >= 201402L && __TBB_GCC_VERSION >= 50000) + #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (__cpp_deduction_guides >= 201606L) + #define __TBB_CPP20_CONCEPTS_PRESENT (__TBB_LANG >= 201709L && __TBB_GCC_VERSION >= 100201) +#elif _MSC_VER + #define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (_MSC_FULL_VER >= 190023918 && (!__INTEL_COMPILER || __INTEL_COMPILER >= 1700)) + #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (_MSC_VER >= 1914 && __TBB_LANG >= 201703L && (!__INTEL_COMPILER || __INTEL_COMPILER > 2021)) + #define __TBB_CPP20_CONCEPTS_PRESENT (_MSC_VER >= 1923 && __TBB_LANG >= 202002L) // TODO: INTEL_COMPILER? +#else + #define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (__TBB_LANG >= 201402L) + #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (__TBB_LANG >= 201703L) + #define __TBB_CPP20_CONCEPTS_PRESENT (__TBB_LANG >= 202002L) +#endif + +// GCC4.8 on RHEL7 does not support std::get_new_handler +#define __TBB_CPP11_GET_NEW_HANDLER_PRESENT (_MSC_VER >= 1900 || __TBB_GLIBCXX_VERSION >= 40900 && __GXX_EXPERIMENTAL_CXX0X__ || _LIBCPP_VERSION) +// GCC4.8 on RHEL7 does not support std::is_trivially_copyable +#define __TBB_CPP11_TYPE_PROPERTIES_PRESENT (_LIBCPP_VERSION || _MSC_VER >= 1700 || (__TBB_GLIBCXX_VERSION >= 50000 && __GXX_EXPERIMENTAL_CXX0X__)) + +#define __TBB_CPP17_MEMORY_RESOURCE_PRESENT (_MSC_VER >= 1913 && (__TBB_LANG > 201402L) || \ + __TBB_GLIBCXX_VERSION >= 90000 && __TBB_LANG >= 201703L) +#define __TBB_CPP17_HW_INTERFERENCE_SIZE_PRESENT (_MSC_VER >= 1911) +#define __TBB_CPP17_LOGICAL_OPERATIONS_PRESENT (__TBB_LANG >= 201703L) +#define __TBB_CPP17_ALLOCATOR_IS_ALWAYS_EQUAL_PRESENT (__TBB_LANG >= 201703L) +#define __TBB_CPP17_IS_SWAPPABLE_PRESENT (__TBB_LANG >= 201703L) + +#if defined(__cpp_impl_three_way_comparison) && defined(__cpp_lib_three_way_comparison) + #define __TBB_CPP20_COMPARISONS_PRESENT ((__cpp_impl_three_way_comparison >= 201907L) && (__cpp_lib_three_way_comparison >= 201907L)) +#else + #define __TBB_CPP20_COMPARISONS_PRESENT 0 +#endif + +#define __TBB_RESUMABLE_TASKS (!__TBB_WIN8UI_SUPPORT && !__ANDROID__ && !__QNXNTO__ && (!__linux__ || __GLIBC__)) + +/* This macro marks incomplete code or comments describing ideas which are considered for the future. + * See also for plain comment with TODO and FIXME marks for small improvement opportunities. + */ +#define __TBB_TODO 0 + +/* Check which standard library we use. */ +/* __TBB_SYMBOL is defined only while processing exported symbols list where C++ is not allowed. 
*/ +#if !defined(__TBB_SYMBOL) && !__TBB_CONFIG_PREPROC_ONLY + #include "third_party/libcxx/cstddef" +#endif + +/** Target OS is either iOS* or iOS* simulator **/ +#if __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ + #define __TBB_IOS 1 +#endif + +#if __APPLE__ + #if __INTEL_COMPILER && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ > 1099 \ + && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101000 + // ICC does not correctly set the macro if -mmacosx-min-version is not specified + #define __TBB_MACOS_TARGET_VERSION (100000 + 10*(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ - 1000)) + #else + #define __TBB_MACOS_TARGET_VERSION __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ + #endif +#endif + +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) + #define __TBB_GCC_WARNING_IGNORED_ATTRIBUTES_PRESENT (__TBB_GCC_VERSION >= 60100) +#endif + +#if __GNUC__ && !__INTEL_COMPILER && !__clang__ + #define __TBB_GCC_PARAMETER_PACK_IN_LAMBDAS_BROKEN (__TBB_GCC_VERSION <= 40805) +#endif + +#define __TBB_CPP17_FALLTHROUGH_PRESENT (__TBB_LANG >= 201703L) +#define __TBB_CPP17_NODISCARD_PRESENT (__TBB_LANG >= 201703L) +#define __TBB_FALLTHROUGH_PRESENT (__TBB_GCC_VERSION >= 70000 && !__INTEL_COMPILER) + +#if __TBB_CPP17_FALLTHROUGH_PRESENT + #define __TBB_fallthrough [[fallthrough]] +#elif __TBB_FALLTHROUGH_PRESENT + #define __TBB_fallthrough __attribute__ ((fallthrough)) +#else + #define __TBB_fallthrough +#endif + +#if __TBB_CPP17_NODISCARD_PRESENT + #define __TBB_nodiscard [[nodiscard]] +#elif __clang__ || __GNUC__ + #define __TBB_nodiscard __attribute__((warn_unused_result)) +#else + #define __TBB_nodiscard +#endif + +#define __TBB_CPP17_UNCAUGHT_EXCEPTIONS_PRESENT (_MSC_VER >= 1900 || __GLIBCXX__ && __cpp_lib_uncaught_exceptions \ + || _LIBCPP_VERSION >= 3700 && (!__TBB_MACOS_TARGET_VERSION || __TBB_MACOS_TARGET_VERSION >= 101200)) + +#define __TBB_TSX_INTRINSICS_PRESENT (__RTM__ || __INTEL_COMPILER || (_MSC_VER>=1700 && (__TBB_x86_64 || __TBB_x86_32))) + +#define __TBB_WAITPKG_INTRINSICS_PRESENT ((__INTEL_COMPILER >= 1900 || __TBB_GCC_VERSION >= 110000 || __TBB_CLANG_VERSION >= 120000) \ + && (_WIN32 || _WIN64 || __unix__ || __APPLE__) && (__TBB_x86_32 || __TBB_x86_64) && !__ANDROID__) + +/** Internal TBB features & modes **/ + +/** __TBB_SOURCE_DIRECTLY_INCLUDED is a mode used in whitebox testing when + it's necessary to test internal functions not exported from TBB DLLs +**/ +#if (_WIN32||_WIN64) && (__TBB_SOURCE_DIRECTLY_INCLUDED || TBB_USE_PREVIEW_BINARY) + #define __TBB_NO_IMPLICIT_LINKAGE 1 + #define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1 +#endif + +#if (__TBB_BUILD || __TBBMALLOC_BUILD || __TBBMALLOCPROXY_BUILD || __TBBBIND_BUILD) && !defined(__TBB_NO_IMPLICIT_LINKAGE) + #define __TBB_NO_IMPLICIT_LINKAGE 1 +#endif + +#if _MSC_VER + #if !__TBB_NO_IMPLICIT_LINKAGE + #ifdef _DEBUG + #pragma comment(lib, "tbb12_debug.lib") + #else + #pragma comment(lib, "tbb12.lib") + #endif + #endif +#endif + +#ifndef __TBB_SCHEDULER_OBSERVER + #define __TBB_SCHEDULER_OBSERVER 1 +#endif /* __TBB_SCHEDULER_OBSERVER */ + +#ifndef __TBB_FP_CONTEXT + #define __TBB_FP_CONTEXT 1 +#endif /* __TBB_FP_CONTEXT */ + +#define __TBB_RECYCLE_TO_ENQUEUE __TBB_BUILD // keep non-official + +#ifndef __TBB_ARENA_OBSERVER + #define __TBB_ARENA_OBSERVER __TBB_SCHEDULER_OBSERVER +#endif /* __TBB_ARENA_OBSERVER */ + +#ifndef __TBB_ARENA_BINDING + #define __TBB_ARENA_BINDING 1 +#endif + +#ifndef __TBB_ENQUEUE_ENFORCED_CONCURRENCY + #define __TBB_ENQUEUE_ENFORCED_CONCURRENCY 1 +#endif + +#if !defined(__TBB_SURVIVE_THREAD_SWITCH) && \ + (_WIN32 
|| _WIN64 || __APPLE__ || (defined(__unix__) && !__ANDROID__)) + #define __TBB_SURVIVE_THREAD_SWITCH 1 +#endif /* __TBB_SURVIVE_THREAD_SWITCH */ + +#ifndef TBB_PREVIEW_FLOW_GRAPH_FEATURES + #define TBB_PREVIEW_FLOW_GRAPH_FEATURES __TBB_CPF_BUILD +#endif + +#ifndef __TBB_DEFAULT_PARTITIONER + #define __TBB_DEFAULT_PARTITIONER tbb::auto_partitioner +#endif + +#ifndef __TBB_FLOW_TRACE_CODEPTR + #define __TBB_FLOW_TRACE_CODEPTR __TBB_CPF_BUILD +#endif + +// Intel(R) C++ Compiler starts analyzing usages of the deprecated content at the template +// instantiation site, which is too late for suppression of the corresponding messages for internal +// stuff. +#if !defined(__INTEL_COMPILER) && (!defined(TBB_SUPPRESS_DEPRECATED_MESSAGES) || (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0)) + #if (__TBB_LANG >= 201402L && (!defined(_MSC_VER) || _MSC_VER >= 1920)) + #define __TBB_DEPRECATED [[deprecated]] + #define __TBB_DEPRECATED_MSG(msg) [[deprecated(msg)]] + #elif _MSC_VER + #define __TBB_DEPRECATED __declspec(deprecated) + #define __TBB_DEPRECATED_MSG(msg) __declspec(deprecated(msg)) + #elif (__GNUC__ && __TBB_GCC_VERSION >= 40805) || __clang__ + #define __TBB_DEPRECATED __attribute__((deprecated)) + #define __TBB_DEPRECATED_MSG(msg) __attribute__((deprecated(msg))) + #endif +#endif // !defined(TBB_SUPPRESS_DEPRECATED_MESSAGES) || (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0) + +#if !defined(__TBB_DEPRECATED) + #define __TBB_DEPRECATED + #define __TBB_DEPRECATED_MSG(msg) +#elif !defined(__TBB_SUPPRESS_INTERNAL_DEPRECATED_MESSAGES) + // Suppress deprecated messages from self + #define __TBB_SUPPRESS_INTERNAL_DEPRECATED_MESSAGES 1 +#endif + +#if defined(TBB_SUPPRESS_DEPRECATED_MESSAGES) && (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0) + #define __TBB_DEPRECATED_VERBOSE __TBB_DEPRECATED + #define __TBB_DEPRECATED_VERBOSE_MSG(msg) __TBB_DEPRECATED_MSG(msg) +#else + #define __TBB_DEPRECATED_VERBOSE + #define __TBB_DEPRECATED_VERBOSE_MSG(msg) +#endif // (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0) + +#if (!defined(TBB_SUPPRESS_DEPRECATED_MESSAGES) || (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0)) && !(__TBB_LANG >= 201103L || _MSC_VER >= 1900) + #pragma message("TBB Warning: Support for C++98/03 is deprecated. Please use the compiler that supports C++11 features at least.") +#endif + +#ifdef _VARIADIC_MAX + #define __TBB_VARIADIC_MAX _VARIADIC_MAX +#else + #if _MSC_VER == 1700 + #define __TBB_VARIADIC_MAX 5 // VS11 setting, issue resolved in VS12 + #elif _MSC_VER == 1600 + #define __TBB_VARIADIC_MAX 10 // VS10 setting + #else + #define __TBB_VARIADIC_MAX 15 + #endif +#endif + +#if __SANITIZE_THREAD__ + #define __TBB_USE_THREAD_SANITIZER 1 +#elif defined(__has_feature) +#if __has_feature(thread_sanitizer) + #define __TBB_USE_THREAD_SANITIZER 1 +#endif +#endif + +#ifndef __TBB_USE_SANITIZERS +#define __TBB_USE_SANITIZERS (__TBB_USE_THREAD_SANITIZER || __TBB_USE_ADDRESS_SANITIZER) +#endif + +#ifndef __TBB_RESUMABLE_TASKS_USE_THREADS +#define __TBB_RESUMABLE_TASKS_USE_THREADS __TBB_USE_SANITIZERS +#endif + +#ifndef __TBB_USE_CONSTRAINTS +#define __TBB_USE_CONSTRAINTS 1 +#endif + +#ifndef __TBB_STRICT_CONSTRAINTS +#define __TBB_STRICT_CONSTRAINTS 1 +#endif + +#if __TBB_CPP20_CONCEPTS_PRESENT && __TBB_USE_CONSTRAINTS + #define __TBB_requires(...) requires __VA_ARGS__ +#else // __TBB_CPP20_CONCEPTS_PRESENT + #define __TBB_requires(...) +#endif // __TBB_CPP20_CONCEPTS_PRESENT + +/** Macros of the form __TBB_XXX_BROKEN denote known issues that are caused by + the bugs in compilers, standard or OS specific libraries. 
They should be + removed as soon as the corresponding bugs are fixed or the buggy OS/compiler + versions go out of the support list. +**/ + +// Some STL containers not support allocator traits in old GCC versions +#if __GXX_EXPERIMENTAL_CXX0X__ && __TBB_GLIBCXX_VERSION <= 50301 + #define TBB_ALLOCATOR_TRAITS_BROKEN 1 +#endif + +// GCC 4.8 C++ standard library implements std::this_thread::yield as no-op. +#if __TBB_GLIBCXX_VERSION >= 40800 && __TBB_GLIBCXX_VERSION < 40900 + #define __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN 1 +#endif + +/** End of __TBB_XXX_BROKEN macro section **/ + +#if defined(_MSC_VER) && _MSC_VER>=1500 && !defined(__INTEL_COMPILER) + // A macro to suppress erroneous or benign "unreachable code" MSVC warning (4702) + #define __TBB_MSVC_UNREACHABLE_CODE_IGNORED 1 +#endif + +// Many OS versions (Android 4.0.[0-3] for example) need workaround for dlopen to avoid non-recursive loader lock hang +// Setting the workaround for all compile targets ($APP_PLATFORM) below Android 4.4 (android-19) +#if __ANDROID__ + // MISSING #include +#endif + +#define __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING (TBB_PREVIEW_FLOW_GRAPH_FEATURES) + +#ifndef __TBB_PREVIEW_CRITICAL_TASKS +#define __TBB_PREVIEW_CRITICAL_TASKS 1 +#endif + +#ifndef __TBB_PREVIEW_FLOW_GRAPH_NODE_SET +#define __TBB_PREVIEW_FLOW_GRAPH_NODE_SET (TBB_PREVIEW_FLOW_GRAPH_FEATURES) +#endif + +#if TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS +#define __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS 1 +#endif + +#if TBB_PREVIEW_TASK_GROUP_EXTENSIONS || __TBB_BUILD +#define __TBB_PREVIEW_TASK_GROUP_EXTENSIONS 1 +#endif + +#endif // __TBB_detail__config_H diff --git a/third_party/tbb/detail/_containers_helpers.h b/third_party/tbb/detail/_containers_helpers.h new file mode 100644 index 000000000..a583a911c --- /dev/null +++ b/third_party/tbb/detail/_containers_helpers.h @@ -0,0 +1,68 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_detail__containers_helpers_H +#define __TBB_detail__containers_helpers_H + +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_allocator_traits.h" +#include "third_party/libcxx/type_traits" +#include "third_party/libcxx/memory" +#include "third_party/libcxx/functional" + +namespace tbb { +namespace detail { +inline namespace d0 { + +template +struct comp_is_transparent : std::false_type {}; + +template +struct comp_is_transparent> : std::true_type {}; + +template +struct has_transparent_key_equal : std::false_type { using type = KeyEqual; }; + +template +struct has_transparent_key_equal> : std::true_type { + using type = typename Hasher::transparent_key_equal; + static_assert(comp_is_transparent::value, "Hash::transparent_key_equal::is_transparent is not valid or does not denote a type."); + static_assert((std::is_same>::value || + std::is_same::value), "KeyEqual is a different type than equal_to or Hash::transparent_key_equal."); + }; + +struct is_iterator_impl { +template +using iter_traits_category = typename std::iterator_traits::iterator_category; + +template +using input_iter_category = typename std::enable_if>::value>::type; +}; // struct is_iterator_impl + +template +using is_input_iterator = supports; + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT +template +inline constexpr bool is_input_iterator_v = is_input_iterator::value; +#endif + +} // inline namespace d0 +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail__containers_helpers_H diff --git a/third_party/tbb/detail/_exception.h b/third_party/tbb/detail/_exception.h new file mode 100644 index 000000000..e209862f2 --- /dev/null +++ b/third_party/tbb/detail/_exception.h @@ -0,0 +1,89 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB__exception_H +#define __TBB__exception_H + +#include "third_party/tbb/detail/_config.h" + +#include "third_party/libcxx/new" // std::bad_alloc +#include "third_party/libcxx/exception" // std::exception +#include "third_party/libcxx/stdexcept" // std::runtime_error + +namespace tbb { +namespace detail { +inline namespace d0 { +enum class exception_id { + bad_alloc = 1, + bad_last_alloc, + user_abort, + nonpositive_step, + out_of_range, + reservation_length_error, + missing_wait, + invalid_load_factor, + invalid_key, + bad_tagged_msg_cast, + unsafe_wait, + last_entry +}; +} // namespace d0 + +#if _MSC_VER + #pragma warning(disable: 4275) +#endif + +namespace r1 { +//! Exception for concurrent containers +class TBB_EXPORT bad_last_alloc : public std::bad_alloc { +public: + const char* __TBB_EXPORTED_METHOD what() const noexcept(true) override; +}; + +//! Exception for user-initiated abort +class TBB_EXPORT user_abort : public std::exception { +public: + const char* __TBB_EXPORTED_METHOD what() const noexcept(true) override; +}; + +//! 
Exception for missing wait on structured_task_group +class TBB_EXPORT missing_wait : public std::exception { +public: + const char* __TBB_EXPORTED_METHOD what() const noexcept(true) override; +}; + +//! Exception for impossible finalization of task_sheduler_handle +class TBB_EXPORT unsafe_wait : public std::runtime_error { +public: + unsafe_wait(const char* msg) : std::runtime_error(msg) {} +}; + +//! Gathers all throw operators in one place. +/** Its purpose is to minimize code bloat that can be caused by throw operators + scattered in multiple places, especially in templates. **/ +TBB_EXPORT void __TBB_EXPORTED_FUNC throw_exception ( exception_id ); +} // namespace r1 + +inline namespace d0 { +using r1::throw_exception; +} // namespace d0 + +} // namespace detail +} // namespace tbb + +#endif // __TBB__exception_H + diff --git a/third_party/tbb/detail/_export.h b/third_party/tbb/detail/_export.h new file mode 100644 index 000000000..515095917 --- /dev/null +++ b/third_party/tbb/detail/_export.h @@ -0,0 +1,47 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__export_H +#define __TBB_detail__export_H + +#if defined(__MINGW32__) + #define _EXPORT __declspec(dllexport) +#elif defined(_WIN32) || defined(__unix__) || defined(__APPLE__) // Use .def files for these + #define _EXPORT +#else + #error "Unknown platform/compiler" +#endif + +#if __TBB_BUILD + #define TBB_EXPORT _EXPORT +#else + #define TBB_EXPORT +#endif + +#if __TBBMALLOC_BUILD + #define TBBMALLOC_EXPORT _EXPORT +#else + #define TBBMALLOC_EXPORT +#endif + +#if __TBBBIND_BUILD + #define TBBBIND_EXPORT _EXPORT +#else + #define TBBBIND_EXPORT +#endif + +#endif diff --git a/third_party/tbb/detail/_flow_graph_body_impl.h b/third_party/tbb/detail/_flow_graph_body_impl.h new file mode 100644 index 000000000..8515c94be --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_body_impl.h @@ -0,0 +1,386 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB__flow_graph_body_impl_H +#define __TBB__flow_graph_body_impl_H + +#ifndef __TBB_flow_graph_H +#error Do not #include this internal file directly; use public TBB headers instead. +#endif + +// included in namespace tbb::detail::d1 (in flow_graph.h) + +typedef std::uint64_t tag_value; + + +// TODO revamp: find out if there is already helper for has_policy. 
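The templates that follow implement a compile-time membership test: has_policy asks whether a given policy type occurs anywhere in a Policy<...> pack (possibly nested). As a rough, self-contained sketch of the same idiom, with deliberately hypothetical names rather than the ones used by TBB:

#include <type_traits>

template <typename... Policies> struct PolicyList {};   // stand-in for the Policy<...> pack below

// Primary template: does Wanted occur among the listed policy types?
template <typename Wanted, typename... Rest> struct contains_policy;

template <typename Wanted>
struct contains_policy<Wanted> : std::false_type {};    // empty list: not found

template <typename Wanted, typename First, typename... Rest>
struct contains_policy<Wanted, First, Rest...>
    : std::integral_constant<bool, std::is_same<Wanted, First>::value ||
                                   contains_policy<Wanted, Rest...>::value> {};

// A wrapped PolicyList is unwrapped and searched.
template <typename Wanted, typename... Policies>
struct contains_policy<Wanted, PolicyList<Policies...>>
    : contains_policy<Wanted, Policies...> {};

static_assert( contains_policy<int,  PolicyList<char, int>>::value, "present in the pack");
static_assert(!contains_policy<long, PolicyList<char, int>>::value, "absent from the pack");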
+template struct Policy {}; + +template struct has_policy; + +template +struct has_policy : + std::integral_constant::value || + has_policy::value> {}; + +template +struct has_policy : + std::integral_constant::value> {}; + +template +struct has_policy > : has_policy {}; + +namespace graph_policy_namespace { + + struct rejecting { }; + struct reserving { }; + struct queueing { }; + struct lightweight { }; + + // K == type of field used for key-matching. Each tag-matching port will be provided + // functor that, given an object accepted by the port, will return the + /// field of type K being used for matching. + template::type > > + __TBB_requires(tbb::detail::hash_compare) + struct key_matching { + typedef K key_type; + typedef typename std::decay::type base_key_type; + typedef KHash hash_compare_type; + }; + + // old tag_matching join's new specifier + typedef key_matching tag_matching; + + // Aliases for Policy combinations + typedef Policy queueing_lightweight; + typedef Policy rejecting_lightweight; + +} // namespace graph_policy_namespace + +// -------------- function_body containers ---------------------- + +//! A functor that takes no input and generates a value of type Output +template< typename Output > +class input_body : no_assign { +public: + virtual ~input_body() {} + virtual Output operator()(flow_control& fc) = 0; + virtual input_body* clone() = 0; +}; + +//! The leaf for input_body +template< typename Output, typename Body> +class input_body_leaf : public input_body { +public: + input_body_leaf( const Body &_body ) : body(_body) { } + Output operator()(flow_control& fc) override { return body(fc); } + input_body_leaf* clone() override { + return new input_body_leaf< Output, Body >(body); + } + Body get_body() { return body; } +private: + Body body; +}; + +//! A functor that takes an Input and generates an Output +template< typename Input, typename Output > +class function_body : no_assign { +public: + virtual ~function_body() {} + virtual Output operator()(const Input &input) = 0; + virtual function_body* clone() = 0; +}; + +//! the leaf for function_body +template +class function_body_leaf : public function_body< Input, Output > { +public: + function_body_leaf( const B &_body ) : body(_body) { } + Output operator()(const Input &i) override { return tbb::detail::invoke(body,i); } + B get_body() { return body; } + function_body_leaf* clone() override { + return new function_body_leaf< Input, Output, B >(body); + } +private: + B body; +}; + +//! the leaf for function_body specialized for Input and output of continue_msg +template +class function_body_leaf< continue_msg, continue_msg, B> : public function_body< continue_msg, continue_msg > { +public: + function_body_leaf( const B &_body ) : body(_body) { } + continue_msg operator()( const continue_msg &i ) override { + body(i); + return i; + } + B get_body() { return body; } + function_body_leaf* clone() override { + return new function_body_leaf< continue_msg, continue_msg, B >(body); + } +private: + B body; +}; + +//! the leaf for function_body specialized for Output of continue_msg +template +class function_body_leaf< Input, continue_msg, B> : public function_body< Input, continue_msg > { +public: + function_body_leaf( const B &_body ) : body(_body) { } + continue_msg operator()(const Input &i) override { + body(i); + return continue_msg(); + } + B get_body() { return body; } + function_body_leaf* clone() override { + return new function_body_leaf< Input, continue_msg, B >(body); + } +private: + B body; +}; + +//! 
the leaf for function_body specialized for Input of continue_msg +template +class function_body_leaf< continue_msg, Output, B > : public function_body< continue_msg, Output > { +public: + function_body_leaf( const B &_body ) : body(_body) { } + Output operator()(const continue_msg &i) override { + return body(i); + } + B get_body() { return body; } + function_body_leaf* clone() override { + return new function_body_leaf< continue_msg, Output, B >(body); + } +private: + B body; +}; + +//! function_body that takes an Input and a set of output ports +template +class multifunction_body : no_assign { +public: + virtual ~multifunction_body () {} + virtual void operator()(const Input &/* input*/, OutputSet &/*oset*/) = 0; + virtual multifunction_body* clone() = 0; + virtual void* get_body_ptr() = 0; +}; + +//! leaf for multifunction. OutputSet can be a std::tuple or a vector. +template +class multifunction_body_leaf : public multifunction_body { +public: + multifunction_body_leaf(const B &_body) : body(_body) { } + void operator()(const Input &input, OutputSet &oset) override { + tbb::detail::invoke(body, input, oset); // body may explicitly put() to one or more of oset. + } + void* get_body_ptr() override { return &body; } + multifunction_body_leaf* clone() override { + return new multifunction_body_leaf(body); + } + +private: + B body; +}; + +// ------ function bodies for hash_buffers and key-matching joins. + +template +class type_to_key_function_body : no_assign { + public: + virtual ~type_to_key_function_body() {} + virtual Output operator()(const Input &input) = 0; // returns an Output + virtual type_to_key_function_body* clone() = 0; +}; + +// specialization for ref output +template +class type_to_key_function_body : no_assign { + public: + virtual ~type_to_key_function_body() {} + virtual const Output & operator()(const Input &input) = 0; // returns a const Output& + virtual type_to_key_function_body* clone() = 0; +}; + +template +class type_to_key_function_body_leaf : public type_to_key_function_body { +public: + type_to_key_function_body_leaf( const B &_body ) : body(_body) { } + Output operator()(const Input &i) override { return tbb::detail::invoke(body, i); } + type_to_key_function_body_leaf* clone() override { + return new type_to_key_function_body_leaf< Input, Output, B>(body); + } +private: + B body; +}; + +template +class type_to_key_function_body_leaf : public type_to_key_function_body< Input, Output&> { +public: + type_to_key_function_body_leaf( const B &_body ) : body(_body) { } + const Output& operator()(const Input &i) override { + return tbb::detail::invoke(body, i); + } + type_to_key_function_body_leaf* clone() override { + return new type_to_key_function_body_leaf< Input, Output&, B>(body); + } +private: + B body; +}; + +// --------------------------- end of function_body containers ------------------------ + +// --------------------------- node task bodies --------------------------------------- + +//! 
A task that calls a node's forward_task function +template< typename NodeType > +class forward_task_bypass : public graph_task { + NodeType &my_node; +public: + forward_task_bypass( graph& g, small_object_allocator& allocator, NodeType &n + , node_priority_t node_priority = no_priority + ) : graph_task(g, allocator, node_priority), + my_node(n) {} + + task* execute(execution_data& ed) override { + graph_task* next_task = my_node.forward_task(); + if (SUCCESSFULLY_ENQUEUED == next_task) + next_task = nullptr; + else if (next_task) + next_task = prioritize_task(my_node.graph_reference(), *next_task); + finalize(ed); + return next_task; + } + + task* cancel(execution_data& ed) override { + finalize(ed); + return nullptr; + } +}; + +//! A task that calls a node's apply_body_bypass function, passing in an input of type Input +// return the task* unless it is SUCCESSFULLY_ENQUEUED, in which case return nullptr +template< typename NodeType, typename Input > +class apply_body_task_bypass : public graph_task { + NodeType &my_node; + Input my_input; +public: + + apply_body_task_bypass( graph& g, small_object_allocator& allocator, NodeType &n, const Input &i + , node_priority_t node_priority = no_priority + ) : graph_task(g, allocator, node_priority), + my_node(n), my_input(i) {} + + task* execute(execution_data& ed) override { + graph_task* next_task = my_node.apply_body_bypass( my_input ); + if (SUCCESSFULLY_ENQUEUED == next_task) + next_task = nullptr; + else if (next_task) + next_task = prioritize_task(my_node.graph_reference(), *next_task); + finalize(ed); + return next_task; + } + + task* cancel(execution_data& ed) override { + finalize(ed); + return nullptr; + } +}; + +//! A task that calls a node's apply_body_bypass function with no input +template< typename NodeType > +class input_node_task_bypass : public graph_task { + NodeType &my_node; +public: + input_node_task_bypass( graph& g, small_object_allocator& allocator, NodeType &n ) + : graph_task(g, allocator), my_node(n) {} + + task* execute(execution_data& ed) override { + graph_task* next_task = my_node.apply_body_bypass( ); + if (SUCCESSFULLY_ENQUEUED == next_task) + next_task = nullptr; + else if (next_task) + next_task = prioritize_task(my_node.graph_reference(), *next_task); + finalize(ed); + return next_task; + } + + task* cancel(execution_data& ed) override { + finalize(ed); + return nullptr; + } +}; + +// ------------------------ end of node task bodies ----------------------------------- + +template +class threshold_regulator; + +template +class threshold_regulator::value>::type> + : public receiver, no_copy +{ + T* my_node; +protected: + + graph_task* try_put_task( const DecrementType& value ) override { + graph_task* result = my_node->decrement_counter( value ); + if( !result ) + result = SUCCESSFULLY_ENQUEUED; + return result; + } + + graph& graph_reference() const override { + return my_node->my_graph; + } + + template friend class limiter_node; + void reset_receiver( reset_flags ) {} + +public: + threshold_regulator(T* owner) : my_node(owner) { + // Do not work with the passed pointer here as it may not be fully initialized yet + } +}; + +template +class threshold_regulator : public continue_receiver, no_copy { + + T *my_node; + + graph_task* execute() override { + return my_node->decrement_counter( 1 ); + } + +protected: + + graph& graph_reference() const override { + return my_node->my_graph; + } + +public: + + typedef continue_msg input_type; + typedef continue_msg output_type; + threshold_regulator(T* owner) + : 
continue_receiver( /*number_of_predecessors=*/0, no_priority ), my_node(owner) + { + // Do not work with the passed pointer here as it may not be fully initialized yet + } +}; + +#endif // __TBB__flow_graph_body_impl_H diff --git a/third_party/tbb/detail/_flow_graph_cache_impl.h b/third_party/tbb/detail/_flow_graph_cache_impl.h new file mode 100644 index 000000000..b75545324 --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_cache_impl.h @@ -0,0 +1,435 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB__flow_graph_cache_impl_H +#define __TBB__flow_graph_cache_impl_H + +#ifndef __TBB_flow_graph_H +#error Do not #include this internal file directly; use public TBB headers instead. +#endif + +// included in namespace tbb::detail::d1 (in flow_graph.h) + +//! A node_cache maintains a std::queue of elements of type T. Each operation is protected by a lock. +template< typename T, typename M=spin_mutex > +class node_cache { + public: + + typedef size_t size_type; + + bool empty() { + typename mutex_type::scoped_lock lock( my_mutex ); + return internal_empty(); + } + + void add( T &n ) { + typename mutex_type::scoped_lock lock( my_mutex ); + internal_push(n); + } + + void remove( T &n ) { + typename mutex_type::scoped_lock lock( my_mutex ); + for ( size_t i = internal_size(); i != 0; --i ) { + T &s = internal_pop(); + if ( &s == &n ) + break; // only remove one predecessor per request + internal_push(s); + } + } + + void clear() { + while( !my_q.empty()) (void)my_q.pop(); + } + +protected: + + typedef M mutex_type; + mutex_type my_mutex; + std::queue< T * > my_q; + + // Assumes lock is held + inline bool internal_empty( ) { + return my_q.empty(); + } + + // Assumes lock is held + inline size_type internal_size( ) { + return my_q.size(); + } + + // Assumes lock is held + inline void internal_push( T &n ) { + my_q.push(&n); + } + + // Assumes lock is held + inline T &internal_pop() { + T *v = my_q.front(); + my_q.pop(); + return *v; + } + +}; + +//! A cache of predecessors that only supports try_get +template< typename T, typename M=spin_mutex > +class predecessor_cache : public node_cache< sender, M > { +public: + typedef M mutex_type; + typedef T output_type; + typedef sender predecessor_type; + typedef receiver successor_type; + + predecessor_cache( successor_type* owner ) : my_owner( owner ) { + __TBB_ASSERT( my_owner, "predecessor_cache should have an owner." 
); + // Do not work with the passed pointer here as it may not be fully initialized yet + } + + bool get_item( output_type& v ) { + + bool msg = false; + + do { + predecessor_type *src; + { + typename mutex_type::scoped_lock lock(this->my_mutex); + if ( this->internal_empty() ) { + break; + } + src = &this->internal_pop(); + } + + // Try to get from this sender + msg = src->try_get( v ); + + if (msg == false) { + // Relinquish ownership of the edge + register_successor(*src, *my_owner); + } else { + // Retain ownership of the edge + this->add(*src); + } + } while ( msg == false ); + return msg; + } + + // If we are removing arcs (rf_clear_edges), call clear() rather than reset(). + void reset() { + for(;;) { + predecessor_type *src; + { + if (this->internal_empty()) break; + src = &this->internal_pop(); + } + register_successor(*src, *my_owner); + } + } + +protected: + successor_type* my_owner; +}; + +//! An cache of predecessors that supports requests and reservations +template< typename T, typename M=spin_mutex > +class reservable_predecessor_cache : public predecessor_cache< T, M > { +public: + typedef M mutex_type; + typedef T output_type; + typedef sender predecessor_type; + typedef receiver successor_type; + + reservable_predecessor_cache( successor_type* owner ) + : predecessor_cache(owner), reserved_src(nullptr) + { + // Do not work with the passed pointer here as it may not be fully initialized yet + } + + bool try_reserve( output_type &v ) { + bool msg = false; + + do { + predecessor_type* pred = nullptr; + { + typename mutex_type::scoped_lock lock(this->my_mutex); + if ( reserved_src.load(std::memory_order_relaxed) || this->internal_empty() ) + return false; + + pred = &this->internal_pop(); + reserved_src.store(pred, std::memory_order_relaxed); + } + + // Try to get from this sender + msg = pred->try_reserve( v ); + + if (msg == false) { + typename mutex_type::scoped_lock lock(this->my_mutex); + // Relinquish ownership of the edge + register_successor( *pred, *this->my_owner ); + reserved_src.store(nullptr, std::memory_order_relaxed); + } else { + // Retain ownership of the edge + this->add( *pred); + } + } while ( msg == false ); + + return msg; + } + + bool try_release() { + reserved_src.load(std::memory_order_relaxed)->try_release(); + reserved_src.store(nullptr, std::memory_order_relaxed); + return true; + } + + bool try_consume() { + reserved_src.load(std::memory_order_relaxed)->try_consume(); + reserved_src.store(nullptr, std::memory_order_relaxed); + return true; + } + + void reset() { + reserved_src.store(nullptr, std::memory_order_relaxed); + predecessor_cache::reset(); + } + + void clear() { + reserved_src.store(nullptr, std::memory_order_relaxed); + predecessor_cache::clear(); + } + +private: + std::atomic reserved_src; +}; + + +//! 
An abstract cache of successors +template +class successor_cache : no_copy { +protected: + + typedef M mutex_type; + mutex_type my_mutex; + + typedef receiver successor_type; + typedef receiver* pointer_type; + typedef sender owner_type; + // TODO revamp: introduce heapified collection of successors for strict priorities + typedef std::list< pointer_type > successors_type; + successors_type my_successors; + + owner_type* my_owner; + +public: + successor_cache( owner_type* owner ) : my_owner(owner) { + // Do not work with the passed pointer here as it may not be fully initialized yet + } + + virtual ~successor_cache() {} + + void register_successor( successor_type& r ) { + typename mutex_type::scoped_lock l(my_mutex, true); + if( r.priority() != no_priority ) + my_successors.push_front( &r ); + else + my_successors.push_back( &r ); + } + + void remove_successor( successor_type& r ) { + typename mutex_type::scoped_lock l(my_mutex, true); + for ( typename successors_type::iterator i = my_successors.begin(); + i != my_successors.end(); ++i ) { + if ( *i == & r ) { + my_successors.erase(i); + break; + } + } + } + + bool empty() { + typename mutex_type::scoped_lock l(my_mutex, false); + return my_successors.empty(); + } + + void clear() { + my_successors.clear(); + } + + virtual graph_task* try_put_task( const T& t ) = 0; +}; // successor_cache + +//! An abstract cache of successors, specialized to continue_msg +template +class successor_cache< continue_msg, M > : no_copy { +protected: + + typedef M mutex_type; + mutex_type my_mutex; + + typedef receiver successor_type; + typedef receiver* pointer_type; + typedef sender owner_type; + typedef std::list< pointer_type > successors_type; + successors_type my_successors; + owner_type* my_owner; + +public: + successor_cache( sender* owner ) : my_owner(owner) { + // Do not work with the passed pointer here as it may not be fully initialized yet + } + + virtual ~successor_cache() {} + + void register_successor( successor_type& r ) { + typename mutex_type::scoped_lock l(my_mutex, true); + if( r.priority() != no_priority ) + my_successors.push_front( &r ); + else + my_successors.push_back( &r ); + __TBB_ASSERT( my_owner, "Cache of successors must have an owner." ); + if ( r.is_continue_receiver() ) { + r.register_predecessor( *my_owner ); + } + } + + void remove_successor( successor_type& r ) { + typename mutex_type::scoped_lock l(my_mutex, true); + for ( successors_type::iterator i = my_successors.begin(); i != my_successors.end(); ++i ) { + if ( *i == &r ) { + __TBB_ASSERT(my_owner, "Cache of successors must have an owner."); + // TODO: check if we need to test for continue_receiver before removing from r. + r.remove_predecessor( *my_owner ); + my_successors.erase(i); + break; + } + } + } + + bool empty() { + typename mutex_type::scoped_lock l(my_mutex, false); + return my_successors.empty(); + } + + void clear() { + my_successors.clear(); + } + + virtual graph_task* try_put_task( const continue_msg& t ) = 0; +}; // successor_cache< continue_msg > + +//! 
A cache of successors that are broadcast to +template +class broadcast_cache : public successor_cache { + typedef successor_cache base_type; + typedef M mutex_type; + typedef typename successor_cache::successors_type successors_type; + +public: + + broadcast_cache( typename base_type::owner_type* owner ): base_type(owner) { + // Do not work with the passed pointer here as it may not be fully initialized yet + } + + // as above, but call try_put_task instead, and return the last task we received (if any) + graph_task* try_put_task( const T &t ) override { + graph_task * last_task = nullptr; + typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true); + typename successors_type::iterator i = this->my_successors.begin(); + while ( i != this->my_successors.end() ) { + graph_task *new_task = (*i)->try_put_task(t); + // workaround for icc bug + graph& graph_ref = (*i)->graph_reference(); + last_task = combine_tasks(graph_ref, last_task, new_task); // enqueue if necessary + if(new_task) { + ++i; + } + else { // failed + if ( (*i)->register_predecessor(*this->my_owner) ) { + i = this->my_successors.erase(i); + } else { + ++i; + } + } + } + return last_task; + } + + // call try_put_task and return list of received tasks + bool gather_successful_try_puts( const T &t, graph_task_list& tasks ) { + bool is_at_least_one_put_successful = false; + typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true); + typename successors_type::iterator i = this->my_successors.begin(); + while ( i != this->my_successors.end() ) { + graph_task * new_task = (*i)->try_put_task(t); + if(new_task) { + ++i; + if(new_task != SUCCESSFULLY_ENQUEUED) { + tasks.push_back(*new_task); + } + is_at_least_one_put_successful = true; + } + else { // failed + if ( (*i)->register_predecessor(*this->my_owner) ) { + i = this->my_successors.erase(i); + } else { + ++i; + } + } + } + return is_at_least_one_put_successful; + } +}; + +//! A cache of successors that are put in a round-robin fashion +template +class round_robin_cache : public successor_cache { + typedef successor_cache base_type; + typedef size_t size_type; + typedef M mutex_type; + typedef typename successor_cache::successors_type successors_type; + +public: + + round_robin_cache( typename base_type::owner_type* owner ): base_type(owner) { + // Do not work with the passed pointer here as it may not be fully initialized yet + } + + size_type size() { + typename mutex_type::scoped_lock l(this->my_mutex, false); + return this->my_successors.size(); + } + + graph_task* try_put_task( const T &t ) override { + typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true); + typename successors_type::iterator i = this->my_successors.begin(); + while ( i != this->my_successors.end() ) { + graph_task* new_task = (*i)->try_put_task(t); + if ( new_task ) { + return new_task; + } else { + if ( (*i)->register_predecessor(*this->my_owner) ) { + i = this->my_successors.erase(i); + } + else { + ++i; + } + } + } + return nullptr; + } +}; + +#endif // __TBB__flow_graph_cache_impl_H diff --git a/third_party/tbb/detail/_flow_graph_impl.h b/third_party/tbb/detail/_flow_graph_impl.h new file mode 100644 index 000000000..38ee6bf9e --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_impl.h @@ -0,0 +1,477 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_flow_graph_impl_H +#define __TBB_flow_graph_impl_H + +// // MISSING #include "../config.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/task_group.h" +#include "third_party/tbb/task_arena.h" +#include "third_party/tbb/flow_graph_abstractions.h" + +#include "third_party/tbb/concurrent_priority_queue.h" + +#include "third_party/libcxx/list" + +namespace tbb { +namespace detail { + +namespace d1 { + +class graph_task; +static graph_task* const SUCCESSFULLY_ENQUEUED = (graph_task*)-1; +typedef unsigned int node_priority_t; +static const node_priority_t no_priority = node_priority_t(0); + +class graph; +class graph_node; + +template +class graph_iterator { + friend class graph; + friend class graph_node; +public: + typedef size_t size_type; + typedef GraphNodeType value_type; + typedef GraphNodeType* pointer; + typedef GraphNodeType& reference; + typedef const GraphNodeType& const_reference; + typedef std::forward_iterator_tag iterator_category; + + //! Copy constructor + graph_iterator(const graph_iterator& other) : + my_graph(other.my_graph), current_node(other.current_node) + {} + + //! Assignment + graph_iterator& operator=(const graph_iterator& other) { + if (this != &other) { + my_graph = other.my_graph; + current_node = other.current_node; + } + return *this; + } + + //! Dereference + reference operator*() const; + + //! Dereference + pointer operator->() const; + + //! Equality + bool operator==(const graph_iterator& other) const { + return ((my_graph == other.my_graph) && (current_node == other.current_node)); + } + +#if !__TBB_CPP20_COMPARISONS_PRESENT + //! Inequality + bool operator!=(const graph_iterator& other) const { return !(operator==(other)); } +#endif + + //! Pre-increment + graph_iterator& operator++() { + internal_forward(); + return *this; + } + + //! Post-increment + graph_iterator operator++(int) { + graph_iterator result = *this; + operator++(); + return result; + } + +private: + // the graph over which we are iterating + GraphContainerType *my_graph; + // pointer into my_graph's my_nodes list + pointer current_node; + + //! Private initializing constructor for begin() and end() iterators + graph_iterator(GraphContainerType *g, bool begin); + void internal_forward(); +}; // class graph_iterator + +// flags to modify the behavior of the graph reset(). Can be combined. +enum reset_flags { + rf_reset_protocol = 0, + rf_reset_bodies = 1 << 0, // delete the current node body, reset to a copy of the initial node body. + rf_clear_edges = 1 << 1 // delete edges +}; + +void activate_graph(graph& g); +void deactivate_graph(graph& g); +bool is_graph_active(graph& g); +graph_task* prioritize_task(graph& g, graph_task& arena_task); +void spawn_in_graph_arena(graph& g, graph_task& arena_task); +void enqueue_in_graph_arena(graph &g, graph_task& arena_task); + +class graph; + +//! Base class for tasks generated by graph nodes. 
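The plumbing declared above (activate_graph, prioritize_task, spawn_in_graph_arena, and the graph_task hierarchy defined next) is what runs when ordinary user code drives a flow graph. A minimal end-to-end sketch against the public API, assuming the public header lands at third_party/tbb/flow_graph.h in this tree:

#include "third_party/tbb/flow_graph.h"   // assumed public-header path for this patch
#include <iostream>

int main() {
    tbb::flow::graph g;

    // A successful try_put() on a node ends up creating a graph_task and handing it to
    // spawn_in_graph_arena()/prioritize_task(), declared above, for execution in g's arena.
    tbb::flow::function_node<int, int> doubler(
        g, tbb::flow::unlimited, [](int v) { return 2 * v; });
    tbb::flow::function_node<int> printer(
        g, tbb::flow::serial, [](int v) { std::cout << v << '\n'; });

    tbb::flow::make_edge(doubler, printer);
    for (int i = 0; i < 4; ++i)
        doubler.try_put(i);

    g.wait_for_all();   // returns once the graph's wait_context bookkeeping drains
    return 0;
}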
+class graph_task : public task { +public: + graph_task(graph& g, small_object_allocator& allocator + , node_priority_t node_priority = no_priority + ) + : my_graph(g) + , priority(node_priority) + , my_allocator(allocator) + {} + graph& my_graph; // graph instance the task belongs to + // TODO revamp: rename to my_priority + node_priority_t priority; + template + void destruct_and_deallocate(const execution_data& ed); +protected: + template + void finalize(const execution_data& ed); +private: + // To organize task_list + graph_task* my_next{ nullptr }; + small_object_allocator my_allocator; + // TODO revamp: elaborate internal interfaces to avoid friends declarations + friend class graph_task_list; + friend graph_task* prioritize_task(graph& g, graph_task& gt); +}; + +struct graph_task_comparator { + bool operator()(const graph_task* left, const graph_task* right) { + return left->priority < right->priority; + } +}; + +typedef tbb::concurrent_priority_queue graph_task_priority_queue_t; + +class priority_task_selector : public task { +public: + priority_task_selector(graph_task_priority_queue_t& priority_queue, small_object_allocator& allocator) + : my_priority_queue(priority_queue), my_allocator(allocator), my_task() {} + task* execute(execution_data& ed) override { + next_task(); + __TBB_ASSERT(my_task, nullptr); + task* t_next = my_task->execute(ed); + my_allocator.delete_object(this, ed); + return t_next; + } + task* cancel(execution_data& ed) override { + if (!my_task) { + next_task(); + } + __TBB_ASSERT(my_task, nullptr); + task* t_next = my_task->cancel(ed); + my_allocator.delete_object(this, ed); + return t_next; + } +private: + void next_task() { + // TODO revamp: hold functors in priority queue instead of real tasks + bool result = my_priority_queue.try_pop(my_task); + __TBB_ASSERT_EX(result, "Number of critical tasks for scheduler and tasks" + " in graph's priority queue mismatched"); + __TBB_ASSERT(my_task && my_task != SUCCESSFULLY_ENQUEUED, + "Incorrect task submitted to graph priority queue"); + __TBB_ASSERT(my_task->priority != no_priority, + "Tasks from graph's priority queue must have priority"); + } + + graph_task_priority_queue_t& my_priority_queue; + small_object_allocator my_allocator; + graph_task* my_task; +}; + +template class run_and_put_task; +template class run_task; + +//******************************************************************************** +// graph tasks helpers +//******************************************************************************** + +//! The list of graph tasks +class graph_task_list : no_copy { +private: + graph_task* my_first; + graph_task** my_next_ptr; +public: + //! Construct empty list + graph_task_list() : my_first(nullptr), my_next_ptr(&my_first) {} + + //! True if list is empty; false otherwise. + bool empty() const { return !my_first; } + + //! Push task onto back of list. + void push_back(graph_task& task) { + task.my_next = nullptr; + *my_next_ptr = &task; + my_next_ptr = &task.my_next; + } + + //! Pop the front task from the list. + graph_task& pop_front() { + __TBB_ASSERT(!empty(), "attempt to pop item from empty task_list"); + graph_task* result = my_first; + my_first = result->my_next; + if (!my_first) { + my_next_ptr = &my_first; + } + return *result; + } +}; + +//! 
The graph class +/** This class serves as a handle to the graph */ +class graph : no_copy, public graph_proxy { + friend class graph_node; + + void prepare_task_arena(bool reinit = false) { + if (reinit) { + __TBB_ASSERT(my_task_arena, "task arena is nullptr"); + my_task_arena->terminate(); + my_task_arena->initialize(task_arena::attach()); + } + else { + __TBB_ASSERT(my_task_arena == nullptr, "task arena is not nullptr"); + my_task_arena = new task_arena(task_arena::attach()); + } + if (!my_task_arena->is_active()) // failed to attach + my_task_arena->initialize(); // create a new, default-initialized arena + __TBB_ASSERT(my_task_arena->is_active(), "task arena is not active"); + } + +public: + //! Constructs a graph with isolated task_group_context + graph(); + + //! Constructs a graph with use_this_context as context + explicit graph(task_group_context& use_this_context); + + //! Destroys the graph. + /** Calls wait_for_all, then destroys the root task and context. */ + ~graph(); + + //! Used to register that an external entity may still interact with the graph. + /** The graph will not return from wait_for_all until a matching number of release_wait calls is + made. */ + void reserve_wait() override; + + //! Deregisters an external entity that may have interacted with the graph. + /** The graph will not return from wait_for_all until all the number of reserve_wait calls + matches the number of release_wait calls. */ + void release_wait() override; + + //! Wait until graph is idle and the number of release_wait calls equals to the number of + //! reserve_wait calls. + /** The waiting thread will go off and steal work while it is blocked in the wait_for_all. */ + void wait_for_all() { + cancelled = false; + caught_exception = false; + try_call([this] { + my_task_arena->execute([this] { + wait(my_wait_context, *my_context); + }); + cancelled = my_context->is_group_execution_cancelled(); + }).on_exception([this] { + my_context->reset(); + caught_exception = true; + cancelled = true; + }); + // TODO: the "if" condition below is just a work-around to support the concurrent wait + // mode. The cancellation and exception mechanisms are still broken in this mode. + // Consider using task group not to re-implement the same functionality. + if (!(my_context->traits() & task_group_context::concurrent_wait)) { + my_context->reset(); // consistent with behavior in catch() + } + } + + // TODO revamp: consider adding getter for task_group_context. + + // ITERATORS + template + friend class graph_iterator; + + // Graph iterator typedefs + typedef graph_iterator iterator; + typedef graph_iterator const_iterator; + + // Graph iterator constructors + //! start iterator + iterator begin(); + //! end iterator + iterator end(); + //! start const iterator + const_iterator begin() const; + //! end const iterator + const_iterator end() const; + //! start const iterator + const_iterator cbegin() const; + //! end const iterator + const_iterator cend() const; + + // thread-unsafe state reset. + void reset(reset_flags f = rf_reset_protocol); + + //! cancels execution of the associated task_group_context + void cancel(); + + //! 
return status of graph execution + bool is_cancelled() { return cancelled; } + bool exception_thrown() { return caught_exception; } + +private: + wait_context my_wait_context; + task_group_context *my_context; + bool own_context; + bool cancelled; + bool caught_exception; + bool my_is_active; + + graph_node *my_nodes, *my_nodes_last; + + tbb::spin_mutex nodelist_mutex; + void register_node(graph_node *n); + void remove_node(graph_node *n); + + task_arena* my_task_arena; + + graph_task_priority_queue_t my_priority_queue; + + friend void activate_graph(graph& g); + friend void deactivate_graph(graph& g); + friend bool is_graph_active(graph& g); + friend graph_task* prioritize_task(graph& g, graph_task& arena_task); + friend void spawn_in_graph_arena(graph& g, graph_task& arena_task); + friend void enqueue_in_graph_arena(graph &g, graph_task& arena_task); + + friend class task_arena_base; + +}; // class graph + +template +inline void graph_task::destruct_and_deallocate(const execution_data& ed) { + auto allocator = my_allocator; + // TODO: investigate if direct call of derived destructor gives any benefits. + this->~graph_task(); + allocator.deallocate(static_cast(this), ed); +} + +template +inline void graph_task::finalize(const execution_data& ed) { + graph& g = my_graph; + destruct_and_deallocate(ed); + g.release_wait(); +} + +//******************************************************************************** +// end of graph tasks helpers +//******************************************************************************** + + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET +class get_graph_helper; +#endif + +//! The base of all graph nodes. +class graph_node : no_copy { + friend class graph; + template + friend class graph_iterator; + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + friend class get_graph_helper; +#endif + +protected: + graph& my_graph; + graph& graph_reference() const { + // TODO revamp: propagate graph_reference() method to all the reference places. + return my_graph; + } + graph_node* next = nullptr; + graph_node* prev = nullptr; +public: + explicit graph_node(graph& g); + + virtual ~graph_node(); + +protected: + // performs the reset on an individual node. + virtual void reset_node(reset_flags f = rf_reset_protocol) = 0; +}; // class graph_node + +inline void activate_graph(graph& g) { + g.my_is_active = true; +} + +inline void deactivate_graph(graph& g) { + g.my_is_active = false; +} + +inline bool is_graph_active(graph& g) { + return g.my_is_active; +} + +inline graph_task* prioritize_task(graph& g, graph_task& gt) { + if( no_priority == gt.priority ) + return > + + //! Non-preemptive priority pattern. The original task is submitted as a work item to the + //! priority queue, and a new critical task is created to take and execute a work item with + //! the highest known priority. The reference counting responsibility is transferred (via + //! allocate_continuation) to the new task. + task* critical_task = gt.my_allocator.new_object(g.my_priority_queue, gt.my_allocator); + __TBB_ASSERT( critical_task, "bad_alloc?" ); + g.my_priority_queue.push(>); + using tbb::detail::d1::submit; + submit( *critical_task, *g.my_task_arena, *g.my_context, /*as_critical=*/true ); + return nullptr; +} + +//! 
Spawns a task inside graph arena +inline void spawn_in_graph_arena(graph& g, graph_task& arena_task) { + if (is_graph_active(g)) { + task* gt = prioritize_task(g, arena_task); + if( !gt ) + return; + + __TBB_ASSERT(g.my_task_arena && g.my_task_arena->is_active(), nullptr); + submit( *gt, *g.my_task_arena, *g.my_context +#if __TBB_PREVIEW_CRITICAL_TASKS + , /*as_critical=*/false +#endif + ); + } +} + +// TODO revamp: unify *_in_graph_arena functions + +//! Enqueues a task inside graph arena +inline void enqueue_in_graph_arena(graph &g, graph_task& arena_task) { + if (is_graph_active(g)) { + __TBB_ASSERT( g.my_task_arena && g.my_task_arena->is_active(), "Is graph's arena initialized and active?" ); + + // TODO revamp: decide on the approach that does not postpone critical task + if( task* gt = prioritize_task(g, arena_task) ) + submit( *gt, *g.my_task_arena, *g.my_context, /*as_critical=*/false); + } +} + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_flow_graph_impl_H diff --git a/third_party/tbb/detail/_flow_graph_indexer_impl.h b/third_party/tbb/detail/_flow_graph_indexer_impl.h new file mode 100644 index 000000000..fdb4f6ab6 --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_indexer_impl.h @@ -0,0 +1,352 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB__flow_graph_indexer_impl_H +#define __TBB__flow_graph_indexer_impl_H + +#ifndef __TBB_flow_graph_H +#error Do not #include this internal file directly; use public TBB headers instead. +#endif + +// included in namespace tbb::detail::d1 + +#include "third_party/tbb/detail/_flow_graph_types_impl.h" + + // Output of the indexer_node is a tbb::flow::tagged_msg, and will be of + // the form tagged_msg + // where the value of tag will indicate which result was put to the + // successor. 
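On the consuming side, the tag carried by the tagged_msg is the index of the input port that produced the value, so a successor can dispatch on tag() and extract the payload with cast_to. A short usage sketch against the public API (include path assumed to match this patch's layout):

#include "third_party/tbb/flow_graph.h"   // assumed public-header path for this patch
#include <iostream>

int main() {
    using namespace tbb::flow;
    graph g;

    indexer_node<int, float> merge(g);                    // output_type is a tagged_msg
    using msg_t = indexer_node<int, float>::output_type;

    function_node<msg_t> sink(g, serial, [](const msg_t& m) {
        if (m.tag() == 0)                                 // value arrived on input port 0
            std::cout << "int: "   << cast_to<int>(m)   << '\n';
        else                                              // value arrived on input port 1
            std::cout << "float: " << cast_to<float>(m) << '\n';
    });

    make_edge(merge, sink);
    input_port<0>(merge).try_put(42);
    input_port<1>(merge).try_put(1.5f);
    g.wait_for_all();
    return 0;
}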
+ + template + graph_task* do_try_put(const T &v, void *p) { + typename IndexerNodeBaseType::output_type o(K, v); + return reinterpret_cast(p)->try_put_task(&o); + } + + template + struct indexer_helper { + template + static inline void set_indexer_node_pointer(PortTuple &my_input, IndexerNodeBaseType *p, graph& g) { + typedef typename std::tuple_element::type T; + graph_task* (*indexer_node_put_task)(const T&, void *) = do_try_put; + std::get(my_input).set_up(p, indexer_node_put_task, g); + indexer_helper::template set_indexer_node_pointer(my_input, p, g); + } + }; + + template + struct indexer_helper { + template + static inline void set_indexer_node_pointer(PortTuple &my_input, IndexerNodeBaseType *p, graph& g) { + typedef typename std::tuple_element<0, TupleTypes>::type T; + graph_task* (*indexer_node_put_task)(const T&, void *) = do_try_put; + std::get<0>(my_input).set_up(p, indexer_node_put_task, g); + } + }; + + template + class indexer_input_port : public receiver { + private: + void* my_indexer_ptr; + typedef graph_task* (* forward_function_ptr)(T const &, void* ); + forward_function_ptr my_try_put_task; + graph* my_graph; + public: + void set_up(void* p, forward_function_ptr f, graph& g) { + my_indexer_ptr = p; + my_try_put_task = f; + my_graph = &g; + } + + protected: + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + graph_task* try_put_task(const T &v) override { + return my_try_put_task(v, my_indexer_ptr); + } + + graph& graph_reference() const override { + return *my_graph; + } + }; + + template + class indexer_node_FE { + public: + static const int N = std::tuple_size::value; + typedef OutputType output_type; + typedef InputTuple input_type; + + // Some versions of Intel(R) C++ Compiler fail to generate an implicit constructor for the class which has std::tuple as a member. + indexer_node_FE() : my_inputs() {} + + input_type &input_ports() { return my_inputs; } + protected: + input_type my_inputs; + }; + + //! 
indexer_node_base + template + class indexer_node_base : public graph_node, public indexer_node_FE, + public sender { + protected: + using graph_node::my_graph; + public: + static const size_t N = std::tuple_size::value; + typedef OutputType output_type; + typedef StructTypes tuple_types; + typedef typename sender::successor_type successor_type; + typedef indexer_node_FE input_ports_type; + + private: + // ----------- Aggregator ------------ + enum op_type { reg_succ, rem_succ, try__put_task + }; + typedef indexer_node_base class_type; + + class indexer_node_base_operation : public aggregated_operation { + public: + char type; + union { + output_type const *my_arg; + successor_type *my_succ; + graph_task* bypass_t; + }; + indexer_node_base_operation(const output_type* e, op_type t) : + type(char(t)), my_arg(e) {} + indexer_node_base_operation(const successor_type &s, op_type t) : type(char(t)), + my_succ(const_cast(&s)) {} + }; + + typedef aggregating_functor handler_type; + friend class aggregating_functor; + aggregator my_aggregator; + + void handle_operations(indexer_node_base_operation* op_list) { + indexer_node_base_operation *current; + while(op_list) { + current = op_list; + op_list = op_list->next; + switch(current->type) { + + case reg_succ: + my_successors.register_successor(*(current->my_succ)); + current->status.store( SUCCEEDED, std::memory_order_release); + break; + + case rem_succ: + my_successors.remove_successor(*(current->my_succ)); + current->status.store( SUCCEEDED, std::memory_order_release); + break; + case try__put_task: { + current->bypass_t = my_successors.try_put_task(*(current->my_arg)); + current->status.store( SUCCEEDED, std::memory_order_release); // return of try_put_task actual return value + } + break; + } + } + } + // ---------- end aggregator ----------- + public: + indexer_node_base(graph& g) : graph_node(g), input_ports_type(), my_successors(this) { + indexer_helper::set_indexer_node_pointer(this->my_inputs, this, g); + my_aggregator.initialize_handler(handler_type(this)); + } + + indexer_node_base(const indexer_node_base& other) + : graph_node(other.my_graph), input_ports_type(), sender(), my_successors(this) + { + indexer_helper::set_indexer_node_pointer(this->my_inputs, this, other.my_graph); + my_aggregator.initialize_handler(handler_type(this)); + } + + bool register_successor(successor_type &r) override { + indexer_node_base_operation op_data(r, reg_succ); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + bool remove_successor( successor_type &r) override { + indexer_node_base_operation op_data(r, rem_succ); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + graph_task* try_put_task(output_type const *v) { // not a virtual method in this class + indexer_node_base_operation op_data(v, try__put_task); + my_aggregator.execute(&op_data); + return op_data.bypass_t; + } + + protected: + void reset_node(reset_flags f) override { + if(f & rf_clear_edges) { + my_successors.clear(); + } + } + + private: + broadcast_cache my_successors; + }; //indexer_node_base + + + template struct input_types; + + template + struct input_types<1, InputTuple> { + typedef typename std::tuple_element<0, InputTuple>::type first_type; + typedef tagged_msg type; + }; + + template + struct input_types<2, InputTuple> { + typedef typename std::tuple_element<0, InputTuple>::type first_type; + typedef typename std::tuple_element<1, InputTuple>::type second_type; + typedef tagged_msg type; + }; + + template + struct 
input_types<3, InputTuple> { + typedef typename std::tuple_element<0, InputTuple>::type first_type; + typedef typename std::tuple_element<1, InputTuple>::type second_type; + typedef typename std::tuple_element<2, InputTuple>::type third_type; + typedef tagged_msg type; + }; + + template + struct input_types<4, InputTuple> { + typedef typename std::tuple_element<0, InputTuple>::type first_type; + typedef typename std::tuple_element<1, InputTuple>::type second_type; + typedef typename std::tuple_element<2, InputTuple>::type third_type; + typedef typename std::tuple_element<3, InputTuple>::type fourth_type; + typedef tagged_msg type; + }; + + template + struct input_types<5, InputTuple> { + typedef typename std::tuple_element<0, InputTuple>::type first_type; + typedef typename std::tuple_element<1, InputTuple>::type second_type; + typedef typename std::tuple_element<2, InputTuple>::type third_type; + typedef typename std::tuple_element<3, InputTuple>::type fourth_type; + typedef typename std::tuple_element<4, InputTuple>::type fifth_type; + typedef tagged_msg type; + }; + + template + struct input_types<6, InputTuple> { + typedef typename std::tuple_element<0, InputTuple>::type first_type; + typedef typename std::tuple_element<1, InputTuple>::type second_type; + typedef typename std::tuple_element<2, InputTuple>::type third_type; + typedef typename std::tuple_element<3, InputTuple>::type fourth_type; + typedef typename std::tuple_element<4, InputTuple>::type fifth_type; + typedef typename std::tuple_element<5, InputTuple>::type sixth_type; + typedef tagged_msg type; + }; + + template + struct input_types<7, InputTuple> { + typedef typename std::tuple_element<0, InputTuple>::type first_type; + typedef typename std::tuple_element<1, InputTuple>::type second_type; + typedef typename std::tuple_element<2, InputTuple>::type third_type; + typedef typename std::tuple_element<3, InputTuple>::type fourth_type; + typedef typename std::tuple_element<4, InputTuple>::type fifth_type; + typedef typename std::tuple_element<5, InputTuple>::type sixth_type; + typedef typename std::tuple_element<6, InputTuple>::type seventh_type; + typedef tagged_msg type; + }; + + + template + struct input_types<8, InputTuple> { + typedef typename std::tuple_element<0, InputTuple>::type first_type; + typedef typename std::tuple_element<1, InputTuple>::type second_type; + typedef typename std::tuple_element<2, InputTuple>::type third_type; + typedef typename std::tuple_element<3, InputTuple>::type fourth_type; + typedef typename std::tuple_element<4, InputTuple>::type fifth_type; + typedef typename std::tuple_element<5, InputTuple>::type sixth_type; + typedef typename std::tuple_element<6, InputTuple>::type seventh_type; + typedef typename std::tuple_element<7, InputTuple>::type eighth_type; + typedef tagged_msg type; + }; + + + template + struct input_types<9, InputTuple> { + typedef typename std::tuple_element<0, InputTuple>::type first_type; + typedef typename std::tuple_element<1, InputTuple>::type second_type; + typedef typename std::tuple_element<2, InputTuple>::type third_type; + typedef typename std::tuple_element<3, InputTuple>::type fourth_type; + typedef typename std::tuple_element<4, InputTuple>::type fifth_type; + typedef typename std::tuple_element<5, InputTuple>::type sixth_type; + typedef typename std::tuple_element<6, InputTuple>::type seventh_type; + typedef typename std::tuple_element<7, InputTuple>::type eighth_type; + typedef typename std::tuple_element<8, InputTuple>::type nineth_type; + typedef 
tagged_msg type; + }; + + template + struct input_types<10, InputTuple> { + typedef typename std::tuple_element<0, InputTuple>::type first_type; + typedef typename std::tuple_element<1, InputTuple>::type second_type; + typedef typename std::tuple_element<2, InputTuple>::type third_type; + typedef typename std::tuple_element<3, InputTuple>::type fourth_type; + typedef typename std::tuple_element<4, InputTuple>::type fifth_type; + typedef typename std::tuple_element<5, InputTuple>::type sixth_type; + typedef typename std::tuple_element<6, InputTuple>::type seventh_type; + typedef typename std::tuple_element<7, InputTuple>::type eighth_type; + typedef typename std::tuple_element<8, InputTuple>::type nineth_type; + typedef typename std::tuple_element<9, InputTuple>::type tenth_type; + typedef tagged_msg type; + }; + + // type generators + template + struct indexer_types : public input_types::value, OutputTuple> { + static const int N = std::tuple_size::value; + typedef typename input_types::type output_type; + typedef typename wrap_tuple_elements::type input_ports_type; + typedef indexer_node_FE indexer_FE_type; + typedef indexer_node_base indexer_base_type; + }; + + template + class unfolded_indexer_node : public indexer_types::indexer_base_type { + public: + typedef typename indexer_types::input_ports_type input_ports_type; + typedef OutputTuple tuple_types; + typedef typename indexer_types::output_type output_type; + private: + typedef typename indexer_types::indexer_base_type base_type; + public: + unfolded_indexer_node(graph& g) : base_type(g) {} + unfolded_indexer_node(const unfolded_indexer_node &other) : base_type(other) {} + }; + +#endif /* __TBB__flow_graph_indexer_impl_H */ diff --git a/third_party/tbb/detail/_flow_graph_item_buffer_impl.h b/third_party/tbb/detail/_flow_graph_item_buffer_impl.h new file mode 100644 index 000000000..6cce8b62c --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_item_buffer_impl.h @@ -0,0 +1,280 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB__flow_graph_item_buffer_impl_H +#define __TBB__flow_graph_item_buffer_impl_H + +#ifndef __TBB_flow_graph_H +#error Do not #include this internal file directly; use public TBB headers instead. +#endif + +#include "third_party/tbb/detail/_aligned_space.h" + +// in namespace tbb::flow::interfaceX (included in _flow_graph_node_impl.h) + +//! Expandable buffer of items. The possible operations are push, pop, +//* tests for empty and so forth. No mutual exclusion is built in. +//* objects are constructed into and explicitly-destroyed. get_my_item gives +// a read-only reference to the item in the buffer. set_my_item may be called +// with either an empty or occupied slot. 
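// A minimal usage sketch, assuming only the public API from <tbb/flow_graph.h>;
// queue_node is one of the public buffering nodes that sit on top of the
// item_buffer defined below. Names and values here are illustrative.
#include <tbb/flow_graph.h>
#include <iostream>

int main() {
    tbb::flow::graph g;
    tbb::flow::queue_node<int> q(g);            // FIFO buffer of ints
    for (int i = 0; i < 4; ++i) q.try_put(i);   // items stay buffered, no successor attached
    g.wait_for_all();
    int v;
    while (q.try_get(v)) std::cout << v << ' '; // pops in FIFO order: 0 1 2 3
    std::cout << '\n';
    return 0;
}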
+ +template > +class item_buffer { +public: + typedef T item_type; + enum buffer_item_state { no_item=0, has_item=1, reserved_item=2 }; +protected: + typedef size_t size_type; + typedef std::pair aligned_space_item; + typedef aligned_space buffer_item_type; + typedef typename allocator_traits::template rebind_alloc allocator_type; + buffer_item_type *my_array; + size_type my_array_size; + static const size_type initial_buffer_size = 4; + size_type my_head; + size_type my_tail; + + bool buffer_empty() const { return my_head == my_tail; } + + aligned_space_item &item(size_type i) { + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->second))%alignment_of::value), nullptr); + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->first))%alignment_of::value), nullptr); + return *my_array[i & (my_array_size - 1) ].begin(); + } + + const aligned_space_item &item(size_type i) const { + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->second))%alignment_of::value), nullptr); + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->first))%alignment_of::value), nullptr); + return *my_array[i & (my_array_size-1)].begin(); + } + + bool my_item_valid(size_type i) const { return (i < my_tail) && (i >= my_head) && (item(i).second != no_item); } +#if TBB_USE_ASSERT + bool my_item_reserved(size_type i) const { return item(i).second == reserved_item; } +#endif + + // object management in buffer + const item_type &get_my_item(size_t i) const { + __TBB_ASSERT(my_item_valid(i),"attempt to get invalid item"); + item_type* itm = const_cast(reinterpret_cast(&item(i).first)); + return *itm; + } + + // may be called with an empty slot or a slot that has already been constructed into. + void set_my_item(size_t i, const item_type &o) { + if(item(i).second != no_item) { + destroy_item(i); + } + new(&(item(i).first)) item_type(o); + item(i).second = has_item; + } + + // destructively-fetch an object from the buffer + void fetch_item(size_t i, item_type &o) { + __TBB_ASSERT(my_item_valid(i), "Trying to fetch an empty slot"); + o = get_my_item(i); // could have std::move assign semantics + destroy_item(i); + } + + // move an existing item from one slot to another. The moved-to slot must be unoccupied, + // the moved-from slot must exist and not be reserved. The after, from will be empty, + // to will be occupied but not reserved + void move_item(size_t to, size_t from) { + __TBB_ASSERT(!my_item_valid(to), "Trying to move to a non-empty slot"); + __TBB_ASSERT(my_item_valid(from), "Trying to move from an empty slot"); + set_my_item(to, get_my_item(from)); // could have std::move semantics + destroy_item(from); + + } + + // put an item in an empty slot. 
Return true if successful, else false + bool place_item(size_t here, const item_type &me) { +#if !TBB_DEPRECATED_SEQUENCER_DUPLICATES + if(my_item_valid(here)) return false; +#endif + set_my_item(here, me); + return true; + } + + // could be implemented with std::move semantics + void swap_items(size_t i, size_t j) { + __TBB_ASSERT(my_item_valid(i) && my_item_valid(j), "attempt to swap invalid item(s)"); + item_type temp = get_my_item(i); + set_my_item(i, get_my_item(j)); + set_my_item(j, temp); + } + + void destroy_item(size_type i) { + __TBB_ASSERT(my_item_valid(i), "destruction of invalid item"); + item(i).first.~item_type(); + item(i).second = no_item; + } + + // returns the front element + const item_type& front() const + { + __TBB_ASSERT(my_item_valid(my_head), "attempt to fetch head non-item"); + return get_my_item(my_head); + } + + // returns the back element + const item_type& back() const + { + __TBB_ASSERT(my_item_valid(my_tail - 1), "attempt to fetch head non-item"); + return get_my_item(my_tail - 1); + } + + // following methods are for reservation of the front of a buffer. + void reserve_item(size_type i) { __TBB_ASSERT(my_item_valid(i) && !my_item_reserved(i), "item cannot be reserved"); item(i).second = reserved_item; } + void release_item(size_type i) { __TBB_ASSERT(my_item_reserved(i), "item is not reserved"); item(i).second = has_item; } + + void destroy_front() { destroy_item(my_head); ++my_head; } + void destroy_back() { destroy_item(my_tail-1); --my_tail; } + + // we have to be able to test against a new tail value without changing my_tail + // grow_array doesn't work if we change my_tail when the old array is too small + size_type size(size_t new_tail = 0) { return (new_tail ? new_tail : my_tail) - my_head; } + size_type capacity() { return my_array_size; } + // sequencer_node does not use this method, so we don't + // need a version that passes in the new_tail value. + bool buffer_full() { return size() >= capacity(); } + + //! Grows the internal array. + void grow_my_array( size_t minimum_size ) { + // test that we haven't made the structure inconsistent. + __TBB_ASSERT(capacity() >= my_tail - my_head, "total items exceed capacity"); + size_type new_size = my_array_size ? 2*my_array_size : initial_buffer_size; + while( new_sizesecond = no_item; } + + for( size_type i=my_head; ifirst); + (void)new(new_space) item_type(get_my_item(i)); + new_array[i&(new_size-1)].begin()->second = item(i).second; + } + } + + clean_up_buffer(/*reset_pointers*/false); + + my_array = new_array; + my_array_size = new_size; + } + + bool push_back(item_type &v) { + if(buffer_full()) { + grow_my_array(size() + 1); + } + set_my_item(my_tail, v); + ++my_tail; + return true; + } + + bool pop_back(item_type &v) { + if (!my_item_valid(my_tail-1)) { + return false; + } + v = this->back(); + destroy_back(); + return true; + } + + bool pop_front(item_type &v) { + if(!my_item_valid(my_head)) { + return false; + } + v = this->front(); + destroy_front(); + return true; + } + + // This is used both for reset and for grow_my_array. In the case of grow_my_array + // we want to retain the values of the head and tail. 
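// A standalone sketch, with simplified and assumed names, of the masked-index
// storage and power-of-two growth used by item(), grow_my_array() and
// push_back() above: the logical indices my_head/my_tail only ever grow, and
// the slot for logical index i is array[i & (capacity - 1)].
#include <cassert>
#include <cstddef>
#include <vector>

struct ring_sketch {
    std::vector<int> slots = std::vector<int>(4);   // capacity stays a power of two
    std::size_t head = 0, tail = 0;

    std::size_t capacity() const { return slots.size(); }
    std::size_t size() const { return tail - head; }

    void grow(std::size_t minimum) {
        std::size_t new_cap = capacity();
        while (new_cap < minimum) new_cap *= 2;
        std::vector<int> bigger(new_cap);
        for (std::size_t i = head; i < tail; ++i)                     // re-home live items,
            bigger[i & (new_cap - 1)] = slots[i & (capacity() - 1)];  // keeping their logical index
        slots.swap(bigger);
    }

    void push_back(int v) {
        if (size() >= capacity()) grow(size() + 1);
        slots[tail & (capacity() - 1)] = v;
        ++tail;
    }

    int pop_front() {
        assert(size() != 0);
        int v = slots[head & (capacity() - 1)];
        ++head;
        return v;
    }
};

int main() {
    ring_sketch r;
    for (int i = 0; i < 10; ++i) r.push_back(i);    // forces growth 4 -> 8 -> 16
    for (int i = 0; i < 10; ++i) assert(r.pop_front() == i);
    return 0;
}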
+ void clean_up_buffer(bool reset_pointers) { + if (my_array) { + for( size_type i=my_head; i > +class reservable_item_buffer : public item_buffer { +protected: + using item_buffer::my_item_valid; + using item_buffer::my_head; + +public: + reservable_item_buffer() : item_buffer(), my_reserved(false) {} + void reset() {my_reserved = false; item_buffer::reset(); } +protected: + + bool reserve_front(T &v) { + if(my_reserved || !my_item_valid(this->my_head)) return false; + my_reserved = true; + // reserving the head + v = this->front(); + this->reserve_item(this->my_head); + return true; + } + + void consume_front() { + __TBB_ASSERT(my_reserved, "Attempt to consume a non-reserved item"); + this->destroy_front(); + my_reserved = false; + } + + void release_front() { + __TBB_ASSERT(my_reserved, "Attempt to release a non-reserved item"); + this->release_item(this->my_head); + my_reserved = false; + } + + bool my_reserved; +}; + +#endif // __TBB__flow_graph_item_buffer_impl_H diff --git a/third_party/tbb/detail/_flow_graph_join_impl.h b/third_party/tbb/detail/_flow_graph_join_impl.h new file mode 100644 index 000000000..1253a0662 --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_join_impl.h @@ -0,0 +1,1709 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB__flow_graph_join_impl_H +#define __TBB__flow_graph_join_impl_H + +#ifndef __TBB_flow_graph_H +#error Do not #include this internal file directly; use public TBB headers instead. +#endif + +// included into namespace tbb::detail::d1 + + struct forwarding_base : no_assign { + forwarding_base(graph &g) : graph_ref(g) {} + virtual ~forwarding_base() {} + graph& graph_ref; + }; + + struct queueing_forwarding_base : forwarding_base { + using forwarding_base::forwarding_base; + // decrement_port_count may create a forwarding task. If we cannot handle the task + // ourselves, ask decrement_port_count to deal with it. + virtual graph_task* decrement_port_count(bool handle_task) = 0; + }; + + struct reserving_forwarding_base : forwarding_base { + using forwarding_base::forwarding_base; + // decrement_port_count may create a forwarding task. If we cannot handle the task + // ourselves, ask decrement_port_count to deal with it. + virtual graph_task* decrement_port_count() = 0; + virtual void increment_port_count() = 0; + }; + + // specialization that lets us keep a copy of the current_key for building results. + // KeyType can be a reference type. 
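// The forwarding_base hierarchy above backs the queueing and reserving join
// policies. A minimal sketch of the corresponding public API, assuming only
// <tbb/flow_graph.h>; the queueing policy shown is the join_node default, and
// all names and values are illustrative.
#include <tbb/flow_graph.h>
#include <iostream>
#include <tuple>

int main() {
    using namespace tbb::flow;
    graph g;
    join_node<std::tuple<int, float>, queueing> j(g);
    function_node<std::tuple<int, float>> consume(
        g, unlimited,
        [](const std::tuple<int, float>& t) {
            std::cout << std::get<0>(t) << " / " << std::get<1>(t) << '\n';
        });
    make_edge(j, consume);
    input_port<0>(j).try_put(1);       // waits in port 0's queue
    input_port<1>(j).try_put(2.5f);    // completes the tuple (1, 2.5f)
    g.wait_for_all();
    return 0;
}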
+ template + struct matching_forwarding_base : public forwarding_base { + typedef typename std::decay::type current_key_type; + matching_forwarding_base(graph &g) : forwarding_base(g) { } + virtual graph_task* increment_key_count(current_key_type const & /*t*/) = 0; + current_key_type current_key; // so ports can refer to FE's desired items + }; + + template< int N > + struct join_helper { + + template< typename TupleType, typename PortType > + static inline void set_join_node_pointer(TupleType &my_input, PortType *port) { + std::get( my_input ).set_join_node_pointer(port); + join_helper::set_join_node_pointer( my_input, port ); + } + template< typename TupleType > + static inline void consume_reservations( TupleType &my_input ) { + std::get( my_input ).consume(); + join_helper::consume_reservations( my_input ); + } + + template< typename TupleType > + static inline void release_my_reservation( TupleType &my_input ) { + std::get( my_input ).release(); + } + + template + static inline void release_reservations( TupleType &my_input) { + join_helper::release_reservations(my_input); + release_my_reservation(my_input); + } + + template< typename InputTuple, typename OutputTuple > + static inline bool reserve( InputTuple &my_input, OutputTuple &out) { + if ( !std::get( my_input ).reserve( std::get( out ) ) ) return false; + if ( !join_helper::reserve( my_input, out ) ) { + release_my_reservation( my_input ); + return false; + } + return true; + } + + template + static inline bool get_my_item( InputTuple &my_input, OutputTuple &out) { + bool res = std::get(my_input).get_item(std::get(out) ); // may fail + return join_helper::get_my_item(my_input, out) && res; // do get on other inputs before returning + } + + template + static inline bool get_items(InputTuple &my_input, OutputTuple &out) { + return get_my_item(my_input, out); + } + + template + static inline void reset_my_port(InputTuple &my_input) { + join_helper::reset_my_port(my_input); + std::get(my_input).reset_port(); + } + + template + static inline void reset_ports(InputTuple& my_input) { + reset_my_port(my_input); + } + + template + static inline void set_key_functors(InputTuple &my_input, KeyFuncTuple &my_key_funcs) { + std::get(my_input).set_my_key_func(std::get(my_key_funcs)); + std::get(my_key_funcs) = nullptr; + join_helper::set_key_functors(my_input, my_key_funcs); + } + + template< typename KeyFuncTuple> + static inline void copy_key_functors(KeyFuncTuple &my_inputs, KeyFuncTuple &other_inputs) { + __TBB_ASSERT( + std::get(other_inputs).get_my_key_func(), + "key matching join node should not be instantiated without functors." 
+ ); + std::get(my_inputs).set_my_key_func(std::get(other_inputs).get_my_key_func()->clone()); + join_helper::copy_key_functors(my_inputs, other_inputs); + } + + template + static inline void reset_inputs(InputTuple &my_input, reset_flags f) { + join_helper::reset_inputs(my_input, f); + std::get(my_input).reset_receiver(f); + } + }; // join_helper + + template< > + struct join_helper<1> { + + template< typename TupleType, typename PortType > + static inline void set_join_node_pointer(TupleType &my_input, PortType *port) { + std::get<0>( my_input ).set_join_node_pointer(port); + } + + template< typename TupleType > + static inline void consume_reservations( TupleType &my_input ) { + std::get<0>( my_input ).consume(); + } + + template< typename TupleType > + static inline void release_my_reservation( TupleType &my_input ) { + std::get<0>( my_input ).release(); + } + + template + static inline void release_reservations( TupleType &my_input) { + release_my_reservation(my_input); + } + + template< typename InputTuple, typename OutputTuple > + static inline bool reserve( InputTuple &my_input, OutputTuple &out) { + return std::get<0>( my_input ).reserve( std::get<0>( out ) ); + } + + template + static inline bool get_my_item( InputTuple &my_input, OutputTuple &out) { + return std::get<0>(my_input).get_item(std::get<0>(out)); + } + + template + static inline bool get_items(InputTuple &my_input, OutputTuple &out) { + return get_my_item(my_input, out); + } + + template + static inline void reset_my_port(InputTuple &my_input) { + std::get<0>(my_input).reset_port(); + } + + template + static inline void reset_ports(InputTuple& my_input) { + reset_my_port(my_input); + } + + template + static inline void set_key_functors(InputTuple &my_input, KeyFuncTuple &my_key_funcs) { + std::get<0>(my_input).set_my_key_func(std::get<0>(my_key_funcs)); + std::get<0>(my_key_funcs) = nullptr; + } + + template< typename KeyFuncTuple> + static inline void copy_key_functors(KeyFuncTuple &my_inputs, KeyFuncTuple &other_inputs) { + __TBB_ASSERT( + std::get<0>(other_inputs).get_my_key_func(), + "key matching join node should not be instantiated without functors." + ); + std::get<0>(my_inputs).set_my_key_func(std::get<0>(other_inputs).get_my_key_func()->clone()); + } + template + static inline void reset_inputs(InputTuple &my_input, reset_flags f) { + std::get<0>(my_input).reset_receiver(f); + } + }; // join_helper<1> + + //! 
The two-phase join port + template< typename T > + class reserving_port : public receiver { + public: + typedef T input_type; + typedef typename receiver::predecessor_type predecessor_type; + + private: + // ----------- Aggregator ------------ + enum op_type { reg_pred, rem_pred, res_item, rel_res, con_res + }; + typedef reserving_port class_type; + + class reserving_port_operation : public aggregated_operation { + public: + char type; + union { + T *my_arg; + predecessor_type *my_pred; + }; + reserving_port_operation(const T& e, op_type t) : + type(char(t)), my_arg(const_cast(&e)) {} + reserving_port_operation(const predecessor_type &s, op_type t) : type(char(t)), + my_pred(const_cast(&s)) {} + reserving_port_operation(op_type t) : type(char(t)) {} + }; + + typedef aggregating_functor handler_type; + friend class aggregating_functor; + aggregator my_aggregator; + + void handle_operations(reserving_port_operation* op_list) { + reserving_port_operation *current; + bool was_missing_predecessors = false; + while(op_list) { + current = op_list; + op_list = op_list->next; + switch(current->type) { + case reg_pred: + was_missing_predecessors = my_predecessors.empty(); + my_predecessors.add(*(current->my_pred)); + if ( was_missing_predecessors ) { + (void) my_join->decrement_port_count(); // may try to forward + } + current->status.store( SUCCEEDED, std::memory_order_release); + break; + case rem_pred: + if ( !my_predecessors.empty() ) { + my_predecessors.remove(*(current->my_pred)); + if ( my_predecessors.empty() ) // was the last predecessor + my_join->increment_port_count(); + } + // TODO: consider returning failure if there were no predecessors to remove + current->status.store( SUCCEEDED, std::memory_order_release ); + break; + case res_item: + if ( reserved ) { + current->status.store( FAILED, std::memory_order_release); + } + else if ( my_predecessors.try_reserve( *(current->my_arg) ) ) { + reserved = true; + current->status.store( SUCCEEDED, std::memory_order_release); + } else { + if ( my_predecessors.empty() ) { + my_join->increment_port_count(); + } + current->status.store( FAILED, std::memory_order_release); + } + break; + case rel_res: + reserved = false; + my_predecessors.try_release( ); + current->status.store( SUCCEEDED, std::memory_order_release); + break; + case con_res: + reserved = false; + my_predecessors.try_consume( ); + current->status.store( SUCCEEDED, std::memory_order_release); + break; + } + } + } + + protected: + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + graph_task* try_put_task( const T & ) override { + return nullptr; + } + + graph& graph_reference() const override { + return my_join->graph_ref; + } + + public: + + //! Constructor + reserving_port() : my_join(nullptr), my_predecessors(this), reserved(false) { + my_aggregator.initialize_handler(handler_type(this)); + } + + // copy constructor + reserving_port(const reserving_port& /* other */) = delete; + + void set_join_node_pointer(reserving_forwarding_base *join) { + my_join = join; + } + + //! Add a predecessor + bool register_predecessor( predecessor_type &src ) override { + reserving_port_operation op_data(src, reg_pred); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + //! 
Remove a predecessor + bool remove_predecessor( predecessor_type &src ) override { + reserving_port_operation op_data(src, rem_pred); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + //! Reserve an item from the port + bool reserve( T &v ) { + reserving_port_operation op_data(v, res_item); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + //! Release the port + void release( ) { + reserving_port_operation op_data(rel_res); + my_aggregator.execute(&op_data); + } + + //! Complete use of the port + void consume( ) { + reserving_port_operation op_data(con_res); + my_aggregator.execute(&op_data); + } + + void reset_receiver( reset_flags f) { + if(f & rf_clear_edges) my_predecessors.clear(); + else + my_predecessors.reset(); + reserved = false; + __TBB_ASSERT(!(f&rf_clear_edges) || my_predecessors.empty(), "port edges not removed"); + } + + private: +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + friend class get_graph_helper; +#endif + + reserving_forwarding_base *my_join; + reservable_predecessor_cache< T, null_mutex > my_predecessors; + bool reserved; + }; // reserving_port + + //! queueing join_port + template + class queueing_port : public receiver, public item_buffer { + public: + typedef T input_type; + typedef typename receiver::predecessor_type predecessor_type; + typedef queueing_port class_type; + + // ----------- Aggregator ------------ + private: + enum op_type { get__item, res_port, try__put_task + }; + + class queueing_port_operation : public aggregated_operation { + public: + char type; + T my_val; + T* my_arg; + graph_task* bypass_t; + // constructor for value parameter + queueing_port_operation(const T& e, op_type t) : + type(char(t)), my_val(e), my_arg(nullptr) + , bypass_t(nullptr) + {} + // constructor for pointer parameter + queueing_port_operation(const T* p, op_type t) : + type(char(t)), my_arg(const_cast(p)) + , bypass_t(nullptr) + {} + // constructor with no parameter + queueing_port_operation(op_type t) : type(char(t)), my_arg(nullptr) + , bypass_t(nullptr) + {} + }; + + typedef aggregating_functor handler_type; + friend class aggregating_functor; + aggregator my_aggregator; + + void handle_operations(queueing_port_operation* op_list) { + queueing_port_operation *current; + bool was_empty; + while(op_list) { + current = op_list; + op_list = op_list->next; + switch(current->type) { + case try__put_task: { + graph_task* rtask = nullptr; + was_empty = this->buffer_empty(); + this->push_back(current->my_val); + if (was_empty) rtask = my_join->decrement_port_count(false); + else + rtask = SUCCESSFULLY_ENQUEUED; + current->bypass_t = rtask; + current->status.store( SUCCEEDED, std::memory_order_release); + } + break; + case get__item: + if(!this->buffer_empty()) { + __TBB_ASSERT(current->my_arg, nullptr); + *(current->my_arg) = this->front(); + current->status.store( SUCCEEDED, std::memory_order_release); + } + else { + current->status.store( FAILED, std::memory_order_release); + } + break; + case res_port: + __TBB_ASSERT(this->my_item_valid(this->my_head), "No item to reset"); + this->destroy_front(); + if(this->my_item_valid(this->my_head)) { + (void)my_join->decrement_port_count(true); + } + current->status.store( SUCCEEDED, std::memory_order_release); + break; + } + } + } + // ------------ End Aggregator --------------- + + protected: + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + graph_task* try_put_task(const 
T &v) override { + queueing_port_operation op_data(v, try__put_task); + my_aggregator.execute(&op_data); + __TBB_ASSERT(op_data.status == SUCCEEDED || !op_data.bypass_t, "inconsistent return from aggregator"); + if(!op_data.bypass_t) return SUCCESSFULLY_ENQUEUED; + return op_data.bypass_t; + } + + graph& graph_reference() const override { + return my_join->graph_ref; + } + + public: + + //! Constructor + queueing_port() : item_buffer() { + my_join = nullptr; + my_aggregator.initialize_handler(handler_type(this)); + } + + //! copy constructor + queueing_port(const queueing_port& /* other */) = delete; + + //! record parent for tallying available items + void set_join_node_pointer(queueing_forwarding_base *join) { + my_join = join; + } + + bool get_item( T &v ) { + queueing_port_operation op_data(&v, get__item); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + // reset_port is called when item is accepted by successor, but + // is initiated by join_node. + void reset_port() { + queueing_port_operation op_data(res_port); + my_aggregator.execute(&op_data); + return; + } + + void reset_receiver(reset_flags) { + item_buffer::reset(); + } + + private: +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + friend class get_graph_helper; +#endif + + queueing_forwarding_base *my_join; + }; // queueing_port + +#include "third_party/tbb/detail/_flow_graph_tagged_buffer_impl.h" + + template + struct count_element { + K my_key; + size_t my_value; + }; + + // method to access the key in the counting table + // the ref has already been removed from K + template< typename K > + struct key_to_count_functor { + typedef count_element table_item_type; + const K& operator()(const table_item_type& v) { return v.my_key; } + }; + + // the ports can have only one template parameter. 
We wrap the types needed in + // a traits type + template< class TraitsType > + class key_matching_port : + public receiver, + public hash_buffer< typename TraitsType::K, typename TraitsType::T, typename TraitsType::TtoK, + typename TraitsType::KHash > { + public: + typedef TraitsType traits; + typedef key_matching_port class_type; + typedef typename TraitsType::T input_type; + typedef typename TraitsType::K key_type; + typedef typename std::decay::type noref_key_type; + typedef typename receiver::predecessor_type predecessor_type; + typedef typename TraitsType::TtoK type_to_key_func_type; + typedef typename TraitsType::KHash hash_compare_type; + typedef hash_buffer< key_type, input_type, type_to_key_func_type, hash_compare_type > buffer_type; + + private: +// ----------- Aggregator ------------ + private: + enum op_type { try__put, get__item, res_port + }; + + class key_matching_port_operation : public aggregated_operation { + public: + char type; + input_type my_val; + input_type *my_arg; + // constructor for value parameter + key_matching_port_operation(const input_type& e, op_type t) : + type(char(t)), my_val(e), my_arg(nullptr) {} + // constructor for pointer parameter + key_matching_port_operation(const input_type* p, op_type t) : + type(char(t)), my_arg(const_cast(p)) {} + // constructor with no parameter + key_matching_port_operation(op_type t) : type(char(t)), my_arg(nullptr) {} + }; + + typedef aggregating_functor handler_type; + friend class aggregating_functor; + aggregator my_aggregator; + + void handle_operations(key_matching_port_operation* op_list) { + key_matching_port_operation *current; + while(op_list) { + current = op_list; + op_list = op_list->next; + switch(current->type) { + case try__put: { + bool was_inserted = this->insert_with_key(current->my_val); + // return failure if a duplicate insertion occurs + current->status.store( was_inserted ? 
SUCCEEDED : FAILED, std::memory_order_release); + } + break; + case get__item: + // use current_key from FE for item + __TBB_ASSERT(current->my_arg, nullptr); + if(!this->find_with_key(my_join->current_key, *(current->my_arg))) { + __TBB_ASSERT(false, "Failed to find item corresponding to current_key."); + } + current->status.store( SUCCEEDED, std::memory_order_release); + break; + case res_port: + // use current_key from FE for item + this->delete_with_key(my_join->current_key); + current->status.store( SUCCEEDED, std::memory_order_release); + break; + } + } + } +// ------------ End Aggregator --------------- + protected: + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + graph_task* try_put_task(const input_type& v) override { + key_matching_port_operation op_data(v, try__put); + graph_task* rtask = nullptr; + my_aggregator.execute(&op_data); + if(op_data.status == SUCCEEDED) { + rtask = my_join->increment_key_count((*(this->get_key_func()))(v)); // may spawn + // rtask has to reflect the return status of the try_put + if(!rtask) rtask = SUCCESSFULLY_ENQUEUED; + } + return rtask; + } + + graph& graph_reference() const override { + return my_join->graph_ref; + } + + public: + + key_matching_port() : receiver(), buffer_type() { + my_join = nullptr; + my_aggregator.initialize_handler(handler_type(this)); + } + + // copy constructor + key_matching_port(const key_matching_port& /*other*/) = delete; +#if __INTEL_COMPILER <= 2021 + // Suppress superfluous diagnostic about virtual keyword absence in a destructor of an inherited + // class while the parent class has the virtual keyword for the destrocutor. + virtual +#endif + ~key_matching_port() { } + + void set_join_node_pointer(forwarding_base *join) { + my_join = dynamic_cast*>(join); + } + + void set_my_key_func(type_to_key_func_type *f) { this->set_key_func(f); } + + type_to_key_func_type* get_my_key_func() { return this->get_key_func(); } + + bool get_item( input_type &v ) { + // aggregator uses current_key from FE for Key + key_matching_port_operation op_data(&v, get__item); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + // reset_port is called when item is accepted by successor, but + // is initiated by join_node. + void reset_port() { + key_matching_port_operation op_data(res_port); + my_aggregator.execute(&op_data); + return; + } + + void reset_receiver(reset_flags ) { + buffer_type::reset(); + } + + private: + // my_join forwarding base used to count number of inputs that + // received key. + matching_forwarding_base *my_join; + }; // key_matching_port + + using namespace graph_policy_namespace; + + template + class join_node_base; + + //! 
join_node_FE : implements input port policy + template + class join_node_FE; + + template + class join_node_FE : public reserving_forwarding_base { + private: + static const int N = std::tuple_size::value; + typedef OutputTuple output_type; + typedef InputTuple input_type; + typedef join_node_base base_node_type; // for forwarding + public: + join_node_FE(graph &g) : reserving_forwarding_base(g), my_node(nullptr) { + ports_with_no_inputs = N; + join_helper::set_join_node_pointer(my_inputs, this); + } + + join_node_FE(const join_node_FE& other) : reserving_forwarding_base((other.reserving_forwarding_base::graph_ref)), my_node(nullptr) { + ports_with_no_inputs = N; + join_helper::set_join_node_pointer(my_inputs, this); + } + + void set_my_node(base_node_type *new_my_node) { my_node = new_my_node; } + + void increment_port_count() override { + ++ports_with_no_inputs; + } + + // if all input_ports have predecessors, spawn forward to try and consume tuples + graph_task* decrement_port_count() override { + if(ports_with_no_inputs.fetch_sub(1) == 1) { + if(is_graph_active(this->graph_ref)) { + small_object_allocator allocator{}; + typedef forward_task_bypass task_type; + graph_task* t = allocator.new_object(graph_ref, allocator, *my_node); + graph_ref.reserve_wait(); + spawn_in_graph_arena(this->graph_ref, *t); + } + } + return nullptr; + } + + input_type &input_ports() { return my_inputs; } + + protected: + + void reset( reset_flags f) { + // called outside of parallel contexts + ports_with_no_inputs = N; + join_helper::reset_inputs(my_inputs, f); + } + + // all methods on input ports should be called under mutual exclusion from join_node_base. + + bool tuple_build_may_succeed() { + return !ports_with_no_inputs; + } + + bool try_to_make_tuple(output_type &out) { + if(ports_with_no_inputs) return false; + return join_helper::reserve(my_inputs, out); + } + + void tuple_accepted() { + join_helper::consume_reservations(my_inputs); + } + void tuple_rejected() { + join_helper::release_reservations(my_inputs); + } + + input_type my_inputs; + base_node_type *my_node; + std::atomic ports_with_no_inputs; + }; // join_node_FE + + template + class join_node_FE : public queueing_forwarding_base { + public: + static const int N = std::tuple_size::value; + typedef OutputTuple output_type; + typedef InputTuple input_type; + typedef join_node_base base_node_type; // for forwarding + + join_node_FE(graph &g) : queueing_forwarding_base(g), my_node(nullptr) { + ports_with_no_items = N; + join_helper::set_join_node_pointer(my_inputs, this); + } + + join_node_FE(const join_node_FE& other) : queueing_forwarding_base((other.queueing_forwarding_base::graph_ref)), my_node(nullptr) { + ports_with_no_items = N; + join_helper::set_join_node_pointer(my_inputs, this); + } + + // needed for forwarding + void set_my_node(base_node_type *new_my_node) { my_node = new_my_node; } + + void reset_port_count() { + ports_with_no_items = N; + } + + // if all input_ports have items, spawn forward to try and consume tuples + graph_task* decrement_port_count(bool handle_task) override + { + if(ports_with_no_items.fetch_sub(1) == 1) { + if(is_graph_active(this->graph_ref)) { + small_object_allocator allocator{}; + typedef forward_task_bypass task_type; + graph_task* t = allocator.new_object(graph_ref, allocator, *my_node); + graph_ref.reserve_wait(); + if( !handle_task ) + return t; + spawn_in_graph_arena(this->graph_ref, *t); + } + } + return nullptr; + } + + input_type &input_ports() { return my_inputs; } + + protected: + + void reset( 
reset_flags f) { + reset_port_count(); + join_helper::reset_inputs(my_inputs, f ); + } + + // all methods on input ports should be called under mutual exclusion from join_node_base. + + bool tuple_build_may_succeed() { + return !ports_with_no_items; + } + + bool try_to_make_tuple(output_type &out) { + if(ports_with_no_items) return false; + return join_helper::get_items(my_inputs, out); + } + + void tuple_accepted() { + reset_port_count(); + join_helper::reset_ports(my_inputs); + } + void tuple_rejected() { + // nothing to do. + } + + input_type my_inputs; + base_node_type *my_node; + std::atomic ports_with_no_items; + }; // join_node_FE + + // key_matching join front-end. + template + class join_node_FE, InputTuple, OutputTuple> : public matching_forwarding_base, + // buffer of key value counts + public hash_buffer< // typedefed below to key_to_count_buffer_type + typename std::decay::type&, // force ref type on K + count_element::type>, + type_to_key_function_body< + count_element::type>, + typename std::decay::type& >, + KHash >, + // buffer of output items + public item_buffer { + public: + static const int N = std::tuple_size::value; + typedef OutputTuple output_type; + typedef InputTuple input_type; + typedef K key_type; + typedef typename std::decay::type unref_key_type; + typedef KHash key_hash_compare; + // must use K without ref. + typedef count_element count_element_type; + // method that lets us refer to the key of this type. + typedef key_to_count_functor key_to_count_func; + typedef type_to_key_function_body< count_element_type, unref_key_type&> TtoK_function_body_type; + typedef type_to_key_function_body_leaf TtoK_function_body_leaf_type; + // this is the type of the special table that keeps track of the number of discrete + // elements corresponding to each key that we've seen. + typedef hash_buffer< unref_key_type&, count_element_type, TtoK_function_body_type, key_hash_compare > + key_to_count_buffer_type; + typedef item_buffer output_buffer_type; + typedef join_node_base, InputTuple, OutputTuple> base_node_type; // for forwarding + typedef matching_forwarding_base forwarding_base_type; + +// ----------- Aggregator ------------ + // the aggregator is only needed to serialize the access to the hash table. + // and the output_buffer_type base class + private: + enum op_type { res_count, inc_count, may_succeed, try_make }; + typedef join_node_FE, InputTuple, OutputTuple> class_type; + + class key_matching_FE_operation : public aggregated_operation { + public: + char type; + unref_key_type my_val; + output_type* my_output; + graph_task* bypass_t; + // constructor for value parameter + key_matching_FE_operation(const unref_key_type& e , op_type t) : type(char(t)), my_val(e), + my_output(nullptr), bypass_t(nullptr) {} + key_matching_FE_operation(output_type *p, op_type t) : type(char(t)), my_output(p), bypass_t(nullptr) {} + // constructor with no parameter + key_matching_FE_operation(op_type t) : type(char(t)), my_output(nullptr), bypass_t(nullptr) {} + }; + + typedef aggregating_functor handler_type; + friend class aggregating_functor; + aggregator my_aggregator; + + // called from aggregator, so serialized + // returns a task pointer if the a task would have been enqueued but we asked that + // it be returned. Otherwise returns nullptr. 
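// A standalone sketch, with assumed names and a plain std::map in place of the
// hash_buffer, of the key-counting protocol that increment_key_count() and
// fill_output_buffer() below implement: each port bumps a per-key counter, and
// once a key has arrived on every port the matched tuple is emitted.
#include <cassert>
#include <cstddef>
#include <map>
#include <optional>
#include <string>
#include <tuple>

struct key_match_sketch {
    static constexpr std::size_t N = 2;            // number of input ports
    std::map<int, std::size_t> seen;               // key -> how many ports delivered it
    std::map<int, int> port0;                      // per-port item storage by key
    std::map<int, std::string> port1;

    std::optional<std::tuple<int, std::string>> put0(int key, int v) {
        port0[key] = v;
        return bump(key);
    }
    std::optional<std::tuple<int, std::string>> put1(int key, std::string v) {
        port1[key] = std::move(v);
        return bump(key);
    }
    std::optional<std::tuple<int, std::string>> bump(int key) {
        if (++seen[key] < N) return std::nullopt;  // some port is still missing this key
        std::tuple<int, std::string> out{port0[key], port1[key]};
        seen.erase(key); port0.erase(key); port1.erase(key);
        return out;
    }
};

int main() {
    key_match_sketch j;
    assert(!j.put0(7, 42));                 // only one port has key 7 so far
    auto out = j.put1(7, "forty-two");      // second port completes the match
    assert(out && std::get<0>(*out) == 42 && std::get<1>(*out) == "forty-two");
    return 0;
}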
+ graph_task* fill_output_buffer(unref_key_type &t) { + output_type l_out; + graph_task* rtask = nullptr; + bool do_fwd = this->buffer_empty() && is_graph_active(this->graph_ref); + this->current_key = t; + this->delete_with_key(this->current_key); // remove the key + if(join_helper::get_items(my_inputs, l_out)) { // <== call back + this->push_back(l_out); + if(do_fwd) { // we enqueue if receiving an item from predecessor, not if successor asks for item + small_object_allocator allocator{}; + typedef forward_task_bypass task_type; + rtask = allocator.new_object(this->graph_ref, allocator, *my_node); + this->graph_ref.reserve_wait(); + do_fwd = false; + } + // retire the input values + join_helper::reset_ports(my_inputs); // <== call back + } + else { + __TBB_ASSERT(false, "should have had something to push"); + } + return rtask; + } + + void handle_operations(key_matching_FE_operation* op_list) { + key_matching_FE_operation *current; + while(op_list) { + current = op_list; + op_list = op_list->next; + switch(current->type) { + case res_count: // called from BE + { + this->destroy_front(); + current->status.store( SUCCEEDED, std::memory_order_release); + } + break; + case inc_count: { // called from input ports + count_element_type *p = nullptr; + unref_key_type &t = current->my_val; + if(!(this->find_ref_with_key(t,p))) { + count_element_type ev; + ev.my_key = t; + ev.my_value = 0; + this->insert_with_key(ev); + bool found = this->find_ref_with_key(t, p); + __TBB_ASSERT_EX(found, "should find key after inserting it"); + } + if(++(p->my_value) == size_t(N)) { + current->bypass_t = fill_output_buffer(t); + } + } + current->status.store( SUCCEEDED, std::memory_order_release); + break; + case may_succeed: // called from BE + current->status.store( this->buffer_empty() ? FAILED : SUCCEEDED, std::memory_order_release); + break; + case try_make: // called from BE + if(this->buffer_empty()) { + current->status.store( FAILED, std::memory_order_release); + } + else { + *(current->my_output) = this->front(); + current->status.store( SUCCEEDED, std::memory_order_release); + } + break; + } + } + } +// ------------ End Aggregator --------------- + + public: + template + join_node_FE(graph &g, FunctionTuple &TtoK_funcs) : forwarding_base_type(g), my_node(nullptr) { + join_helper::set_join_node_pointer(my_inputs, this); + join_helper::set_key_functors(my_inputs, TtoK_funcs); + my_aggregator.initialize_handler(handler_type(this)); + TtoK_function_body_type *cfb = new TtoK_function_body_leaf_type(key_to_count_func()); + this->set_key_func(cfb); + } + + join_node_FE(const join_node_FE& other) : forwarding_base_type((other.forwarding_base_type::graph_ref)), key_to_count_buffer_type(), + output_buffer_type() { + my_node = nullptr; + join_helper::set_join_node_pointer(my_inputs, this); + join_helper::copy_key_functors(my_inputs, const_cast(other.my_inputs)); + my_aggregator.initialize_handler(handler_type(this)); + TtoK_function_body_type *cfb = new TtoK_function_body_leaf_type(key_to_count_func()); + this->set_key_func(cfb); + } + + // needed for forwarding + void set_my_node(base_node_type *new_my_node) { my_node = new_my_node; } + + void reset_port_count() { // called from BE + key_matching_FE_operation op_data(res_count); + my_aggregator.execute(&op_data); + return; + } + + // if all input_ports have items, spawn forward to try and consume tuples + // return a task if we are asked and did create one. 
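// The key_matching front end above and below is what the public
// join_node<Tuple, key_matching<K>> constructor with per-port key functions
// ultimately drives. A minimal usage sketch, assuming only <tbb/flow_graph.h>;
// the order/payment types, the key lambdas and the values are illustrative.
#include <tbb/flow_graph.h>
#include <iostream>
#include <string>
#include <tuple>

struct order   { int id; std::string item; };
struct payment { int id; double amount; };

int main() {
    using namespace tbb::flow;
    graph g;
    // Tuples are assembled from messages whose key functions return equal ints,
    // regardless of arrival order.
    join_node<std::tuple<order, payment>, key_matching<int>> j(
        g,
        [](const order& o)   { return o.id; },
        [](const payment& p) { return p.id; });
    function_node<std::tuple<order, payment>> report(
        g, unlimited,
        [](const std::tuple<order, payment>& t) {
            std::cout << std::get<0>(t).item << " paid "
                      << std::get<1>(t).amount << '\n';
        });
    make_edge(j, report);
    input_port<0>(j).try_put(order{1, "book"});
    input_port<1>(j).try_put(payment{1, 9.99});
    g.wait_for_all();
    return 0;
}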
+ graph_task *increment_key_count(unref_key_type const & t) override { // called from input_ports + key_matching_FE_operation op_data(t, inc_count); + my_aggregator.execute(&op_data); + return op_data.bypass_t; + } + + input_type &input_ports() { return my_inputs; } + + protected: + + void reset( reset_flags f ) { + // called outside of parallel contexts + join_helper::reset_inputs(my_inputs, f); + + key_to_count_buffer_type::reset(); + output_buffer_type::reset(); + } + + // all methods on input ports should be called under mutual exclusion from join_node_base. + + bool tuple_build_may_succeed() { // called from back-end + key_matching_FE_operation op_data(may_succeed); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + // cannot lock while calling back to input_ports. current_key will only be set + // and reset under the aggregator, so it will remain consistent. + bool try_to_make_tuple(output_type &out) { + key_matching_FE_operation op_data(&out,try_make); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + void tuple_accepted() { + reset_port_count(); // reset current_key after ports reset. + } + + void tuple_rejected() { + // nothing to do. + } + + input_type my_inputs; // input ports + base_node_type *my_node; + }; // join_node_FE, InputTuple, OutputTuple> + + //! join_node_base + template + class join_node_base : public graph_node, public join_node_FE, + public sender { + protected: + using graph_node::my_graph; + public: + typedef OutputTuple output_type; + + typedef typename sender::successor_type successor_type; + typedef join_node_FE input_ports_type; + using input_ports_type::tuple_build_may_succeed; + using input_ports_type::try_to_make_tuple; + using input_ports_type::tuple_accepted; + using input_ports_type::tuple_rejected; + + private: + // ----------- Aggregator ------------ + enum op_type { reg_succ, rem_succ, try__get, do_fwrd, do_fwrd_bypass + }; + typedef join_node_base class_type; + + class join_node_base_operation : public aggregated_operation { + public: + char type; + union { + output_type *my_arg; + successor_type *my_succ; + }; + graph_task* bypass_t; + join_node_base_operation(const output_type& e, op_type t) : type(char(t)), + my_arg(const_cast(&e)), bypass_t(nullptr) {} + join_node_base_operation(const successor_type &s, op_type t) : type(char(t)), + my_succ(const_cast(&s)), bypass_t(nullptr) {} + join_node_base_operation(op_type t) : type(char(t)), bypass_t(nullptr) {} + }; + + typedef aggregating_functor handler_type; + friend class aggregating_functor; + bool forwarder_busy; + aggregator my_aggregator; + + void handle_operations(join_node_base_operation* op_list) { + join_node_base_operation *current; + while(op_list) { + current = op_list; + op_list = op_list->next; + switch(current->type) { + case reg_succ: { + my_successors.register_successor(*(current->my_succ)); + if(tuple_build_may_succeed() && !forwarder_busy && is_graph_active(my_graph)) { + small_object_allocator allocator{}; + typedef forward_task_bypass< join_node_base > task_type; + graph_task* t = allocator.new_object(my_graph, allocator, *this); + my_graph.reserve_wait(); + spawn_in_graph_arena(my_graph, *t); + forwarder_busy = true; + } + current->status.store( SUCCEEDED, std::memory_order_release); + } + break; + case rem_succ: + my_successors.remove_successor(*(current->my_succ)); + current->status.store( SUCCEEDED, std::memory_order_release); + break; + case try__get: + if(tuple_build_may_succeed()) { + 
if(try_to_make_tuple(*(current->my_arg))) { + tuple_accepted(); + current->status.store( SUCCEEDED, std::memory_order_release); + } + else current->status.store( FAILED, std::memory_order_release); + } + else current->status.store( FAILED, std::memory_order_release); + break; + case do_fwrd_bypass: { + bool build_succeeded; + graph_task *last_task = nullptr; + output_type out; + // forwarding must be exclusive, because try_to_make_tuple and tuple_accepted + // are separate locked methods in the FE. We could conceivably fetch the front + // of the FE queue, then be swapped out, have someone else consume the FE's + // object, then come back, forward, and then try to remove it from the queue + // again. Without reservation of the FE, the methods accessing it must be locked. + // We could remember the keys of the objects we forwarded, and then remove + // them from the input ports after forwarding is complete? + if(tuple_build_may_succeed()) { // checks output queue of FE + do { + build_succeeded = try_to_make_tuple(out); // fetch front_end of queue + if(build_succeeded) { + graph_task *new_task = my_successors.try_put_task(out); + last_task = combine_tasks(my_graph, last_task, new_task); + if(new_task) { + tuple_accepted(); + } + else { + tuple_rejected(); + build_succeeded = false; + } + } + } while(build_succeeded); + } + current->bypass_t = last_task; + current->status.store( SUCCEEDED, std::memory_order_release); + forwarder_busy = false; + } + break; + } + } + } + // ---------- end aggregator ----------- + public: + join_node_base(graph &g) + : graph_node(g), input_ports_type(g), forwarder_busy(false), my_successors(this) + { + input_ports_type::set_my_node(this); + my_aggregator.initialize_handler(handler_type(this)); + } + + join_node_base(const join_node_base& other) : + graph_node(other.graph_node::my_graph), input_ports_type(other), + sender(), forwarder_busy(false), my_successors(this) + { + input_ports_type::set_my_node(this); + my_aggregator.initialize_handler(handler_type(this)); + } + + template + join_node_base(graph &g, FunctionTuple f) + : graph_node(g), input_ports_type(g, f), forwarder_busy(false), my_successors(this) + { + input_ports_type::set_my_node(this); + my_aggregator.initialize_handler(handler_type(this)); + } + + bool register_successor(successor_type &r) override { + join_node_base_operation op_data(r, reg_succ); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + bool remove_successor( successor_type &r) override { + join_node_base_operation op_data(r, rem_succ); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + bool try_get( output_type &v) override { + join_node_base_operation op_data(v, try__get); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + protected: + void reset_node(reset_flags f) override { + input_ports_type::reset(f); + if(f & rf_clear_edges) my_successors.clear(); + } + + private: + broadcast_cache my_successors; + + friend class forward_task_bypass< join_node_base >; + graph_task *forward_task() { + join_node_base_operation op_data(do_fwrd_bypass); + my_aggregator.execute(&op_data); + return op_data.bypass_t; + } + + }; // join_node_base + + // join base class type generator + template class PT, typename OutputTuple, typename JP> + struct join_base { + typedef join_node_base::type, OutputTuple> type; + }; + + template + struct join_base > { + typedef key_matching key_traits_type; + typedef K key_type; + typedef KHash key_hash_compare; + typedef 
join_node_base< key_traits_type, + // ports type + typename wrap_key_tuple_elements::type, + OutputTuple > type; + }; + + //! unfolded_join_node : passes input_ports_type to join_node_base. We build the input port type + // using tuple_element. The class PT is the port type (reserving_port, queueing_port, key_matching_port) + // and should match the typename. + + template class PT, typename OutputTuple, typename JP> + class unfolded_join_node : public join_base::type { + public: + typedef typename wrap_tuple_elements::type input_ports_type; + typedef OutputTuple output_type; + private: + typedef join_node_base base_type; + public: + unfolded_join_node(graph &g) : base_type(g) {} + unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} + }; + +#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + template + struct key_from_message_body { + K operator()(const T& t) const { + return key_from_message(t); + } + }; + // Adds const to reference type + template + struct key_from_message_body { + const K& operator()(const T& t) const { + return key_from_message(t); + } + }; +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + // key_matching unfolded_join_node. This must be a separate specialization because the constructors + // differ. + + template + class unfolded_join_node<2,key_matching_port,OutputTuple,key_matching > : public + join_base<2,key_matching_port,OutputTuple,key_matching >::type { + typedef typename std::tuple_element<0, OutputTuple>::type T0; + typedef typename std::tuple_element<1, OutputTuple>::type T1; + public: + typedef typename wrap_key_tuple_elements<2,key_matching_port,key_matching,OutputTuple>::type input_ports_type; + typedef OutputTuple output_type; + private: + typedef join_node_base, input_ports_type, output_type > base_type; + typedef type_to_key_function_body *f0_p; + typedef type_to_key_function_body *f1_p; + typedef std::tuple< f0_p, f1_p > func_initializer_type; + public: +#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + unfolded_join_node(graph &g) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()) + ) ) { + } +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + template + unfolded_join_node(graph &g, Body0 body0, Body1 body1) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf(body0), + new type_to_key_function_body_leaf(body1) + ) ) { + static_assert(std::tuple_size::value == 2, "wrong number of body initializers"); + } + unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} + }; + + template + class unfolded_join_node<3,key_matching_port,OutputTuple,key_matching > : public + join_base<3,key_matching_port,OutputTuple,key_matching >::type { + typedef typename std::tuple_element<0, OutputTuple>::type T0; + typedef typename std::tuple_element<1, OutputTuple>::type T1; + typedef typename std::tuple_element<2, OutputTuple>::type T2; + public: + typedef typename wrap_key_tuple_elements<3,key_matching_port,key_matching,OutputTuple>::type input_ports_type; + typedef OutputTuple output_type; + private: + typedef join_node_base, input_ports_type, output_type > base_type; + typedef type_to_key_function_body *f0_p; + typedef type_to_key_function_body *f1_p; + typedef type_to_key_function_body *f2_p; + typedef std::tuple< f0_p, f1_p, f2_p > func_initializer_type; + public: +#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + unfolded_join_node(graph &g) : base_type(g, + func_initializer_type( + new 
type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()) + ) ) { + } +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + template + unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf(body0), + new type_to_key_function_body_leaf(body1), + new type_to_key_function_body_leaf(body2) + ) ) { + static_assert(std::tuple_size::value == 3, "wrong number of body initializers"); + } + unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} + }; + + template + class unfolded_join_node<4,key_matching_port,OutputTuple,key_matching > : public + join_base<4,key_matching_port,OutputTuple,key_matching >::type { + typedef typename std::tuple_element<0, OutputTuple>::type T0; + typedef typename std::tuple_element<1, OutputTuple>::type T1; + typedef typename std::tuple_element<2, OutputTuple>::type T2; + typedef typename std::tuple_element<3, OutputTuple>::type T3; + public: + typedef typename wrap_key_tuple_elements<4,key_matching_port,key_matching,OutputTuple>::type input_ports_type; + typedef OutputTuple output_type; + private: + typedef join_node_base, input_ports_type, output_type > base_type; + typedef type_to_key_function_body *f0_p; + typedef type_to_key_function_body *f1_p; + typedef type_to_key_function_body *f2_p; + typedef type_to_key_function_body *f3_p; + typedef std::tuple< f0_p, f1_p, f2_p, f3_p > func_initializer_type; + public: +#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + unfolded_join_node(graph &g) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()) + ) ) { + } +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + template + unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf(body0), + new type_to_key_function_body_leaf(body1), + new type_to_key_function_body_leaf(body2), + new type_to_key_function_body_leaf(body3) + ) ) { + static_assert(std::tuple_size::value == 4, "wrong number of body initializers"); + } + unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} + }; + + template + class unfolded_join_node<5,key_matching_port,OutputTuple,key_matching > : public + join_base<5,key_matching_port,OutputTuple,key_matching >::type { + typedef typename std::tuple_element<0, OutputTuple>::type T0; + typedef typename std::tuple_element<1, OutputTuple>::type T1; + typedef typename std::tuple_element<2, OutputTuple>::type T2; + typedef typename std::tuple_element<3, OutputTuple>::type T3; + typedef typename std::tuple_element<4, OutputTuple>::type T4; + public: + typedef typename wrap_key_tuple_elements<5,key_matching_port,key_matching,OutputTuple>::type input_ports_type; + typedef OutputTuple output_type; + private: + typedef join_node_base , input_ports_type, output_type > base_type; + typedef type_to_key_function_body *f0_p; + typedef type_to_key_function_body *f1_p; + typedef type_to_key_function_body *f2_p; + typedef type_to_key_function_body *f3_p; + typedef type_to_key_function_body *f4_p; + typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p > func_initializer_type; + public: +#if 
__TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + unfolded_join_node(graph &g) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()) + ) ) { + } +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + template + unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf(body0), + new type_to_key_function_body_leaf(body1), + new type_to_key_function_body_leaf(body2), + new type_to_key_function_body_leaf(body3), + new type_to_key_function_body_leaf(body4) + ) ) { + static_assert(std::tuple_size::value == 5, "wrong number of body initializers"); + } + unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} + }; + +#if __TBB_VARIADIC_MAX >= 6 + template + class unfolded_join_node<6,key_matching_port,OutputTuple,key_matching > : public + join_base<6,key_matching_port,OutputTuple,key_matching >::type { + typedef typename std::tuple_element<0, OutputTuple>::type T0; + typedef typename std::tuple_element<1, OutputTuple>::type T1; + typedef typename std::tuple_element<2, OutputTuple>::type T2; + typedef typename std::tuple_element<3, OutputTuple>::type T3; + typedef typename std::tuple_element<4, OutputTuple>::type T4; + typedef typename std::tuple_element<5, OutputTuple>::type T5; + public: + typedef typename wrap_key_tuple_elements<6,key_matching_port,key_matching,OutputTuple>::type input_ports_type; + typedef OutputTuple output_type; + private: + typedef join_node_base , input_ports_type, output_type > base_type; + typedef type_to_key_function_body *f0_p; + typedef type_to_key_function_body *f1_p; + typedef type_to_key_function_body *f2_p; + typedef type_to_key_function_body *f3_p; + typedef type_to_key_function_body *f4_p; + typedef type_to_key_function_body *f5_p; + typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p > func_initializer_type; + public: +#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + unfolded_join_node(graph &g) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()) + ) ) { + } +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + template + unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4, Body5 body5) + : base_type(g, func_initializer_type( + new type_to_key_function_body_leaf(body0), + new type_to_key_function_body_leaf(body1), + new type_to_key_function_body_leaf(body2), + new type_to_key_function_body_leaf(body3), + new type_to_key_function_body_leaf(body4), + new type_to_key_function_body_leaf(body5) + ) ) { + static_assert(std::tuple_size::value == 6, "wrong number of body initializers"); + } + unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} + }; +#endif + +#if __TBB_VARIADIC_MAX >= 7 + template + class unfolded_join_node<7,key_matching_port,OutputTuple,key_matching > : public + 
join_base<7,key_matching_port,OutputTuple,key_matching >::type { + typedef typename std::tuple_element<0, OutputTuple>::type T0; + typedef typename std::tuple_element<1, OutputTuple>::type T1; + typedef typename std::tuple_element<2, OutputTuple>::type T2; + typedef typename std::tuple_element<3, OutputTuple>::type T3; + typedef typename std::tuple_element<4, OutputTuple>::type T4; + typedef typename std::tuple_element<5, OutputTuple>::type T5; + typedef typename std::tuple_element<6, OutputTuple>::type T6; + public: + typedef typename wrap_key_tuple_elements<7,key_matching_port,key_matching,OutputTuple>::type input_ports_type; + typedef OutputTuple output_type; + private: + typedef join_node_base , input_ports_type, output_type > base_type; + typedef type_to_key_function_body *f0_p; + typedef type_to_key_function_body *f1_p; + typedef type_to_key_function_body *f2_p; + typedef type_to_key_function_body *f3_p; + typedef type_to_key_function_body *f4_p; + typedef type_to_key_function_body *f5_p; + typedef type_to_key_function_body *f6_p; + typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p, f6_p > func_initializer_type; + public: +#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + unfolded_join_node(graph &g) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()) + ) ) { + } +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + template + unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4, + Body5 body5, Body6 body6) : base_type(g, func_initializer_type( + new type_to_key_function_body_leaf(body0), + new type_to_key_function_body_leaf(body1), + new type_to_key_function_body_leaf(body2), + new type_to_key_function_body_leaf(body3), + new type_to_key_function_body_leaf(body4), + new type_to_key_function_body_leaf(body5), + new type_to_key_function_body_leaf(body6) + ) ) { + static_assert(std::tuple_size::value == 7, "wrong number of body initializers"); + } + unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} + }; +#endif + +#if __TBB_VARIADIC_MAX >= 8 + template + class unfolded_join_node<8,key_matching_port,OutputTuple,key_matching > : public + join_base<8,key_matching_port,OutputTuple,key_matching >::type { + typedef typename std::tuple_element<0, OutputTuple>::type T0; + typedef typename std::tuple_element<1, OutputTuple>::type T1; + typedef typename std::tuple_element<2, OutputTuple>::type T2; + typedef typename std::tuple_element<3, OutputTuple>::type T3; + typedef typename std::tuple_element<4, OutputTuple>::type T4; + typedef typename std::tuple_element<5, OutputTuple>::type T5; + typedef typename std::tuple_element<6, OutputTuple>::type T6; + typedef typename std::tuple_element<7, OutputTuple>::type T7; + public: + typedef typename wrap_key_tuple_elements<8,key_matching_port,key_matching,OutputTuple>::type input_ports_type; + typedef OutputTuple output_type; + private: + typedef join_node_base , input_ports_type, output_type > base_type; + typedef type_to_key_function_body *f0_p; + typedef type_to_key_function_body *f1_p; + typedef type_to_key_function_body *f2_p; + typedef 
type_to_key_function_body *f3_p; + typedef type_to_key_function_body *f4_p; + typedef type_to_key_function_body *f5_p; + typedef type_to_key_function_body *f6_p; + typedef type_to_key_function_body *f7_p; + typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p, f6_p, f7_p > func_initializer_type; + public: +#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + unfolded_join_node(graph &g) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()) + ) ) { + } +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + template + unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4, + Body5 body5, Body6 body6, Body7 body7) : base_type(g, func_initializer_type( + new type_to_key_function_body_leaf(body0), + new type_to_key_function_body_leaf(body1), + new type_to_key_function_body_leaf(body2), + new type_to_key_function_body_leaf(body3), + new type_to_key_function_body_leaf(body4), + new type_to_key_function_body_leaf(body5), + new type_to_key_function_body_leaf(body6), + new type_to_key_function_body_leaf(body7) + ) ) { + static_assert(std::tuple_size::value == 8, "wrong number of body initializers"); + } + unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} + }; +#endif + +#if __TBB_VARIADIC_MAX >= 9 + template + class unfolded_join_node<9,key_matching_port,OutputTuple,key_matching > : public + join_base<9,key_matching_port,OutputTuple,key_matching >::type { + typedef typename std::tuple_element<0, OutputTuple>::type T0; + typedef typename std::tuple_element<1, OutputTuple>::type T1; + typedef typename std::tuple_element<2, OutputTuple>::type T2; + typedef typename std::tuple_element<3, OutputTuple>::type T3; + typedef typename std::tuple_element<4, OutputTuple>::type T4; + typedef typename std::tuple_element<5, OutputTuple>::type T5; + typedef typename std::tuple_element<6, OutputTuple>::type T6; + typedef typename std::tuple_element<7, OutputTuple>::type T7; + typedef typename std::tuple_element<8, OutputTuple>::type T8; + public: + typedef typename wrap_key_tuple_elements<9,key_matching_port,key_matching,OutputTuple>::type input_ports_type; + typedef OutputTuple output_type; + private: + typedef join_node_base , input_ports_type, output_type > base_type; + typedef type_to_key_function_body *f0_p; + typedef type_to_key_function_body *f1_p; + typedef type_to_key_function_body *f2_p; + typedef type_to_key_function_body *f3_p; + typedef type_to_key_function_body *f4_p; + typedef type_to_key_function_body *f5_p; + typedef type_to_key_function_body *f6_p; + typedef type_to_key_function_body *f7_p; + typedef type_to_key_function_body *f8_p; + typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p, f6_p, f7_p, f8_p > func_initializer_type; + public: +#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + unfolded_join_node(graph &g) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf 
>(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()) + ) ) { + } +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + template + unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4, + Body5 body5, Body6 body6, Body7 body7, Body8 body8) : base_type(g, func_initializer_type( + new type_to_key_function_body_leaf(body0), + new type_to_key_function_body_leaf(body1), + new type_to_key_function_body_leaf(body2), + new type_to_key_function_body_leaf(body3), + new type_to_key_function_body_leaf(body4), + new type_to_key_function_body_leaf(body5), + new type_to_key_function_body_leaf(body6), + new type_to_key_function_body_leaf(body7), + new type_to_key_function_body_leaf(body8) + ) ) { + static_assert(std::tuple_size::value == 9, "wrong number of body initializers"); + } + unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} + }; +#endif + +#if __TBB_VARIADIC_MAX >= 10 + template + class unfolded_join_node<10,key_matching_port,OutputTuple,key_matching > : public + join_base<10,key_matching_port,OutputTuple,key_matching >::type { + typedef typename std::tuple_element<0, OutputTuple>::type T0; + typedef typename std::tuple_element<1, OutputTuple>::type T1; + typedef typename std::tuple_element<2, OutputTuple>::type T2; + typedef typename std::tuple_element<3, OutputTuple>::type T3; + typedef typename std::tuple_element<4, OutputTuple>::type T4; + typedef typename std::tuple_element<5, OutputTuple>::type T5; + typedef typename std::tuple_element<6, OutputTuple>::type T6; + typedef typename std::tuple_element<7, OutputTuple>::type T7; + typedef typename std::tuple_element<8, OutputTuple>::type T8; + typedef typename std::tuple_element<9, OutputTuple>::type T9; + public: + typedef typename wrap_key_tuple_elements<10,key_matching_port,key_matching,OutputTuple>::type input_ports_type; + typedef OutputTuple output_type; + private: + typedef join_node_base , input_ports_type, output_type > base_type; + typedef type_to_key_function_body *f0_p; + typedef type_to_key_function_body *f1_p; + typedef type_to_key_function_body *f2_p; + typedef type_to_key_function_body *f3_p; + typedef type_to_key_function_body *f4_p; + typedef type_to_key_function_body *f5_p; + typedef type_to_key_function_body *f6_p; + typedef type_to_key_function_body *f7_p; + typedef type_to_key_function_body *f8_p; + typedef type_to_key_function_body *f9_p; + typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p, f6_p, f7_p, f8_p, f9_p > func_initializer_type; + public: +#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + unfolded_join_node(graph &g) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new 
type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()) + ) ) { + } +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + template + unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4, + Body5 body5, Body6 body6, Body7 body7, Body8 body8, Body9 body9) : base_type(g, func_initializer_type( + new type_to_key_function_body_leaf(body0), + new type_to_key_function_body_leaf(body1), + new type_to_key_function_body_leaf(body2), + new type_to_key_function_body_leaf(body3), + new type_to_key_function_body_leaf(body4), + new type_to_key_function_body_leaf(body5), + new type_to_key_function_body_leaf(body6), + new type_to_key_function_body_leaf(body7), + new type_to_key_function_body_leaf(body8), + new type_to_key_function_body_leaf(body9) + ) ) { + static_assert(std::tuple_size::value == 10, "wrong number of body initializers"); + } + unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} + }; +#endif + + //! templated function to refer to input ports of the join node + template + typename std::tuple_element::type &input_port(JNT &jn) { + return std::get(jn.input_ports()); + } + +#endif // __TBB__flow_graph_join_impl_H diff --git a/third_party/tbb/detail/_flow_graph_node_impl.h b/third_party/tbb/detail/_flow_graph_node_impl.h new file mode 100644 index 000000000..20a3741d0 --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_node_impl.h @@ -0,0 +1,775 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB__flow_graph_node_impl_H +#define __TBB__flow_graph_node_impl_H + +#ifndef __TBB_flow_graph_H +#error Do not #include this internal file directly; use public TBB headers instead. +#endif + +#include "third_party/tbb/detail/_flow_graph_item_buffer_impl.h" + +template< typename T, typename A > +class function_input_queue : public item_buffer { +public: + bool empty() const { + return this->buffer_empty(); + } + + const T& front() const { + return this->item_buffer::front(); + } + + void pop() { + this->destroy_front(); + } + + bool push( T& t ) { + return this->push_back( t ); + } +}; + +//! Input and scheduling for a function node that takes a type Input as input +// The only up-ref is apply_body_impl, which should implement the function +// call and any handling of the result. +template< typename Input, typename Policy, typename A, typename ImplType > +class function_input_base : public receiver, no_assign { + enum op_type {reg_pred, rem_pred, try_fwd, tryput_bypass, app_body_bypass, occupy_concurrency + }; + typedef function_input_base class_type; + +public: + + //! 
The input type of this receiver + typedef Input input_type; + typedef typename receiver::predecessor_type predecessor_type; + typedef predecessor_cache predecessor_cache_type; + typedef function_input_queue input_queue_type; + typedef typename allocator_traits::template rebind_alloc allocator_type; + static_assert(!has_policy::value || !has_policy::value, ""); + + //! Constructor for function_input_base + function_input_base( graph &g, size_t max_concurrency, node_priority_t a_priority, bool is_no_throw ) + : my_graph_ref(g), my_max_concurrency(max_concurrency) + , my_concurrency(0), my_priority(a_priority), my_is_no_throw(is_no_throw) + , my_queue(!has_policy::value ? new input_queue_type() : nullptr) + , my_predecessors(this) + , forwarder_busy(false) + { + my_aggregator.initialize_handler(handler_type(this)); + } + + //! Copy constructor + function_input_base( const function_input_base& src ) + : function_input_base(src.my_graph_ref, src.my_max_concurrency, src.my_priority, src.my_is_no_throw) {} + + //! Destructor + // The queue is allocated by the constructor for {multi}function_node. + // TODO: pass the graph_buffer_policy to the base so it can allocate the queue instead. + // This would be an interface-breaking change. + virtual ~function_input_base() { + delete my_queue; + my_queue = nullptr; + } + + graph_task* try_put_task( const input_type& t) override { + if ( my_is_no_throw ) + return try_put_task_impl(t, has_policy()); + else + return try_put_task_impl(t, std::false_type()); + } + + //! Adds src to the list of cached predecessors. + bool register_predecessor( predecessor_type &src ) override { + operation_type op_data(reg_pred); + op_data.r = &src; + my_aggregator.execute(&op_data); + return true; + } + + //! Removes src from the list of cached predecessors. 
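// (Illustrative sketch, not from the TBB sources.) register_predecessor() above
// and remove_predecessor() below never touch the predecessor cache directly:
// each call packages a reg_pred / rem_pred operation and hands it to
// my_aggregator, and whichever thread currently owns the pending-operation list
// applies the whole batch in handle_operations() further down. The standalone
// toy below models that protocol under invented names (mini_op,
// mini_aggregator) and omits TBB's waiting/backoff machinery.

#include <atomic>
#include <thread>

struct mini_op {
    std::atomic<int> status{0};   // 0 = pending, 1 = handled
    mini_op* next = nullptr;
    int kind = 0;                 // stands in for reg_pred, rem_pred, ...
};

class mini_aggregator {
    std::atomic<mini_op*> pending_{nullptr};
    std::atomic<bool> handler_busy_{false};
public:
    template <typename Handler>
    void execute(mini_op* op, Handler handle) {
        // Publish the operation on a lock-free LIFO list.
        mini_op* old_head = pending_.load(std::memory_order_relaxed);
        do {
            op->next = old_head;
        } while (!pending_.compare_exchange_weak(old_head, op,
                                                 std::memory_order_release,
                                                 std::memory_order_relaxed));
        if (old_head == nullptr) {
            // The list was empty, so this thread becomes the handler; wait for
            // any previous handler to retire so batches stay serialized.
            while (handler_busy_.exchange(true, std::memory_order_acquire))
                std::this_thread::yield();
            mini_op* batch = pending_.exchange(nullptr, std::memory_order_acquire);
            while (batch) {
                mini_op* next = batch->next;   // read next before publishing status
                handle(batch);                 // runs with exclusive access to shared state
                batch->status.store(1, std::memory_order_release);
                batch = next;
            }
            handler_busy_.store(false, std::memory_order_release);
        } else {
            // Another thread owns the batch; just wait for our operation.
            while (op->status.load(std::memory_order_acquire) == 0)
                std::this_thread::yield();
        }
    }
};

// A caller would do something like:
//     mini_op op; op.kind = 1;
//     agg.execute(&op, [](mini_op* o) { /* mutate the shared structure */ });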
+ bool remove_predecessor( predecessor_type &src ) override { + operation_type op_data(rem_pred); + op_data.r = &src; + my_aggregator.execute(&op_data); + return true; + } + +protected: + + void reset_function_input_base( reset_flags f) { + my_concurrency = 0; + if(my_queue) { + my_queue->reset(); + } + reset_receiver(f); + forwarder_busy = false; + } + + graph& my_graph_ref; + const size_t my_max_concurrency; + size_t my_concurrency; + node_priority_t my_priority; + const bool my_is_no_throw; + input_queue_type *my_queue; + predecessor_cache my_predecessors; + + void reset_receiver( reset_flags f) { + if( f & rf_clear_edges) my_predecessors.clear(); + else + my_predecessors.reset(); + __TBB_ASSERT(!(f & rf_clear_edges) || my_predecessors.empty(), "function_input_base reset failed"); + } + + graph& graph_reference() const override { + return my_graph_ref; + } + + graph_task* try_get_postponed_task(const input_type& i) { + operation_type op_data(i, app_body_bypass); // tries to pop an item or get_item + my_aggregator.execute(&op_data); + return op_data.bypass_t; + } + +private: + + friend class apply_body_task_bypass< class_type, input_type >; + friend class forward_task_bypass< class_type >; + + class operation_type : public aggregated_operation< operation_type > { + public: + char type; + union { + input_type *elem; + predecessor_type *r; + }; + graph_task* bypass_t; + operation_type(const input_type& e, op_type t) : + type(char(t)), elem(const_cast(&e)), bypass_t(nullptr) {} + operation_type(op_type t) : type(char(t)), r(nullptr), bypass_t(nullptr) {} + }; + + bool forwarder_busy; + typedef aggregating_functor handler_type; + friend class aggregating_functor; + aggregator< handler_type, operation_type > my_aggregator; + + graph_task* perform_queued_requests() { + graph_task* new_task = nullptr; + if(my_queue) { + if(!my_queue->empty()) { + ++my_concurrency; + new_task = create_body_task(my_queue->front()); + + my_queue->pop(); + } + } + else { + input_type i; + if(my_predecessors.get_item(i)) { + ++my_concurrency; + new_task = create_body_task(i); + } + } + return new_task; + } + void handle_operations(operation_type *op_list) { + operation_type* tmp; + while (op_list) { + tmp = op_list; + op_list = op_list->next; + switch (tmp->type) { + case reg_pred: + my_predecessors.add(*(tmp->r)); + tmp->status.store(SUCCEEDED, std::memory_order_release); + if (!forwarder_busy) { + forwarder_busy = true; + spawn_forward_task(); + } + break; + case rem_pred: + my_predecessors.remove(*(tmp->r)); + tmp->status.store(SUCCEEDED, std::memory_order_release); + break; + case app_body_bypass: { + tmp->bypass_t = nullptr; + __TBB_ASSERT(my_max_concurrency != 0, nullptr); + --my_concurrency; + if(my_concurrencybypass_t = perform_queued_requests(); + tmp->status.store(SUCCEEDED, std::memory_order_release); + } + break; + case tryput_bypass: internal_try_put_task(tmp); break; + case try_fwd: internal_forward(tmp); break; + case occupy_concurrency: + if (my_concurrency < my_max_concurrency) { + ++my_concurrency; + tmp->status.store(SUCCEEDED, std::memory_order_release); + } else { + tmp->status.store(FAILED, std::memory_order_release); + } + break; + } + } + } + + //! 
Put to the node, but return the task instead of enqueueing it + void internal_try_put_task(operation_type *op) { + __TBB_ASSERT(my_max_concurrency != 0, nullptr); + if (my_concurrency < my_max_concurrency) { + ++my_concurrency; + graph_task * new_task = create_body_task(*(op->elem)); + op->bypass_t = new_task; + op->status.store(SUCCEEDED, std::memory_order_release); + } else if ( my_queue && my_queue->push(*(op->elem)) ) { + op->bypass_t = SUCCESSFULLY_ENQUEUED; + op->status.store(SUCCEEDED, std::memory_order_release); + } else { + op->bypass_t = nullptr; + op->status.store(FAILED, std::memory_order_release); + } + } + + //! Creates tasks for postponed messages if available and if concurrency allows + void internal_forward(operation_type *op) { + op->bypass_t = nullptr; + if (my_concurrency < my_max_concurrency) + op->bypass_t = perform_queued_requests(); + if(op->bypass_t) + op->status.store(SUCCEEDED, std::memory_order_release); + else { + forwarder_busy = false; + op->status.store(FAILED, std::memory_order_release); + } + } + + graph_task* internal_try_put_bypass( const input_type& t ) { + operation_type op_data(t, tryput_bypass); + my_aggregator.execute(&op_data); + if( op_data.status == SUCCEEDED ) { + return op_data.bypass_t; + } + return nullptr; + } + + graph_task* try_put_task_impl( const input_type& t, /*lightweight=*/std::true_type ) { + if( my_max_concurrency == 0 ) { + return apply_body_bypass(t); + } else { + operation_type check_op(t, occupy_concurrency); + my_aggregator.execute(&check_op); + if( check_op.status == SUCCEEDED ) { + return apply_body_bypass(t); + } + return internal_try_put_bypass(t); + } + } + + graph_task* try_put_task_impl( const input_type& t, /*lightweight=*/std::false_type ) { + if( my_max_concurrency == 0 ) { + return create_body_task(t); + } else { + return internal_try_put_bypass(t); + } + } + + //! Applies the body to the provided input + // then decides if more work is available + graph_task* apply_body_bypass( const input_type &i ) { + return static_cast(this)->apply_body_impl_bypass(i); + } + + //! allocates a task to apply a body + graph_task* create_body_task( const input_type &input ) { + if (!is_graph_active(my_graph_ref)) { + return nullptr; + } + // TODO revamp: extract helper for common graph task allocation part + small_object_allocator allocator{}; + typedef apply_body_task_bypass task_type; + graph_task* t = allocator.new_object( my_graph_ref, allocator, *this, input, my_priority ); + graph_reference().reserve_wait(); + return t; + } + + //! This is executed by an enqueued task, the "forwarder" + graph_task* forward_task() { + operation_type op_data(try_fwd); + graph_task* rval = nullptr; + do { + op_data.status = WAIT; + my_aggregator.execute(&op_data); + if(op_data.status == SUCCEEDED) { + graph_task* ttask = op_data.bypass_t; + __TBB_ASSERT( ttask && ttask != SUCCESSFULLY_ENQUEUED, nullptr); + rval = combine_tasks(my_graph_ref, rval, ttask); + } + } while (op_data.status == SUCCEEDED); + return rval; + } + + inline graph_task* create_forward_task() { + if (!is_graph_active(my_graph_ref)) { + return nullptr; + } + small_object_allocator allocator{}; + typedef forward_task_bypass task_type; + graph_task* t = allocator.new_object( graph_reference(), allocator, *this, my_priority ); + graph_reference().reserve_wait(); + return t; + } + + //! 
Spawns a task that calls forward() + inline void spawn_forward_task() { + graph_task* tp = create_forward_task(); + if(tp) { + spawn_in_graph_arena(graph_reference(), *tp); + } + } + + node_priority_t priority() const override { return my_priority; } +}; // function_input_base + +//! Implements methods for a function node that takes a type Input as input and sends +// a type Output to its successors. +template< typename Input, typename Output, typename Policy, typename A> +class function_input : public function_input_base > { +public: + typedef Input input_type; + typedef Output output_type; + typedef function_body function_body_type; + typedef function_input my_class; + typedef function_input_base base_type; + typedef function_input_queue input_queue_type; + + // constructor + template + function_input( + graph &g, size_t max_concurrency, Body& body, node_priority_t a_priority ) + : base_type(g, max_concurrency, a_priority, noexcept(tbb::detail::invoke(body, input_type()))) + , my_body( new function_body_leaf< input_type, output_type, Body>(body) ) + , my_init_body( new function_body_leaf< input_type, output_type, Body>(body) ) { + } + + //! Copy constructor + function_input( const function_input& src ) : + base_type(src), + my_body( src.my_init_body->clone() ), + my_init_body(src.my_init_body->clone() ) { + } +#if __INTEL_COMPILER <= 2021 + // Suppress superfluous diagnostic about virtual keyword absence in a destructor of an inherited + // class while the parent class has the virtual keyword for the destrocutor. + virtual +#endif + ~function_input() { + delete my_body; + delete my_init_body; + } + + template< typename Body > + Body copy_function_object() { + function_body_type &body_ref = *this->my_body; + return dynamic_cast< function_body_leaf & >(body_ref).get_body(); + } + + output_type apply_body_impl( const input_type& i) { + // There is an extra copied needed to capture the + // body execution without the try_put + fgt_begin_body( my_body ); + output_type v = tbb::detail::invoke(*my_body, i); + fgt_end_body( my_body ); + return v; + } + + //TODO: consider moving into the base class + graph_task* apply_body_impl_bypass( const input_type &i) { + output_type v = apply_body_impl(i); + graph_task* postponed_task = nullptr; + if( base_type::my_max_concurrency != 0 ) { + postponed_task = base_type::try_get_postponed_task(i); + __TBB_ASSERT( !postponed_task || postponed_task != SUCCESSFULLY_ENQUEUED, nullptr); + } + if( postponed_task ) { + // make the task available for other workers since we do not know successors' + // execution policy + spawn_in_graph_arena(base_type::graph_reference(), *postponed_task); + } + graph_task* successor_task = successors().try_put_task(v); +#if _MSC_VER && !__INTEL_COMPILER +#pragma warning (push) +#pragma warning (disable: 4127) /* suppress conditional expression is constant */ +#endif + if(has_policy::value) { +#if _MSC_VER && !__INTEL_COMPILER +#pragma warning (pop) +#endif + if(!successor_task) { + // Return confirmative status since current + // node's body has been executed anyway + successor_task = SUCCESSFULLY_ENQUEUED; + } + } + return successor_task; + } + +protected: + + void reset_function_input(reset_flags f) { + base_type::reset_function_input_base(f); + if(f & rf_reset_bodies) { + function_body_type *tmp = my_init_body->clone(); + delete my_body; + my_body = tmp; + } + } + + function_body_type *my_body; + function_body_type *my_init_body; + virtual broadcast_cache &successors() = 0; + +}; // function_input + + +// helper templates to clear 
the successor edges of the output ports of an multifunction_node +template struct clear_element { + template static void clear_this(P &p) { + (void)std::get(p).successors().clear(); + clear_element::clear_this(p); + } +#if TBB_USE_ASSERT + template static bool this_empty(P &p) { + if(std::get(p).successors().empty()) + return clear_element::this_empty(p); + return false; + } +#endif +}; + +template<> struct clear_element<1> { + template static void clear_this(P &p) { + (void)std::get<0>(p).successors().clear(); + } +#if TBB_USE_ASSERT + template static bool this_empty(P &p) { + return std::get<0>(p).successors().empty(); + } +#endif +}; + +template +struct init_output_ports { + template + static OutputTuple call(graph& g, const std::tuple&) { + return OutputTuple(Args(g)...); + } +}; // struct init_output_ports + +//! Implements methods for a function node that takes a type Input as input +// and has a tuple of output ports specified. +template< typename Input, typename OutputPortSet, typename Policy, typename A> +class multifunction_input : public function_input_base > { +public: + static const int N = std::tuple_size::value; + typedef Input input_type; + typedef OutputPortSet output_ports_type; + typedef multifunction_body multifunction_body_type; + typedef multifunction_input my_class; + typedef function_input_base base_type; + typedef function_input_queue input_queue_type; + + // constructor + template + multifunction_input(graph &g, size_t max_concurrency,Body& body, node_priority_t a_priority ) + : base_type(g, max_concurrency, a_priority, noexcept(tbb::detail::invoke(body, input_type(), my_output_ports))) + , my_body( new multifunction_body_leaf(body) ) + , my_init_body( new multifunction_body_leaf(body) ) + , my_output_ports(init_output_ports::call(g, my_output_ports)){ + } + + //! Copy constructor + multifunction_input( const multifunction_input& src ) : + base_type(src), + my_body( src.my_init_body->clone() ), + my_init_body(src.my_init_body->clone() ), + my_output_ports( init_output_ports::call(src.my_graph_ref, my_output_ports) ) { + } + + ~multifunction_input() { + delete my_body; + delete my_init_body; + } + + template< typename Body > + Body copy_function_object() { + multifunction_body_type &body_ref = *this->my_body; + return *static_cast(dynamic_cast< multifunction_body_leaf & >(body_ref).get_body_ptr()); + } + + // for multifunction nodes we do not have a single successor as such. So we just tell + // the task we were successful. + //TODO: consider moving common parts with implementation in function_input into separate function + graph_task* apply_body_impl_bypass( const input_type &i ) { + fgt_begin_body( my_body ); + (*my_body)(i, my_output_ports); + fgt_end_body( my_body ); + graph_task* ttask = nullptr; + if(base_type::my_max_concurrency != 0) { + ttask = base_type::try_get_postponed_task(i); + } + return ttask ? 
ttask : SUCCESSFULLY_ENQUEUED; + } + + output_ports_type &output_ports(){ return my_output_ports; } + +protected: + + void reset(reset_flags f) { + base_type::reset_function_input_base(f); + if(f & rf_clear_edges)clear_element::clear_this(my_output_ports); + if(f & rf_reset_bodies) { + multifunction_body_type* tmp = my_init_body->clone(); + delete my_body; + my_body = tmp; + } + __TBB_ASSERT(!(f & rf_clear_edges) || clear_element::this_empty(my_output_ports), "multifunction_node reset failed"); + } + + multifunction_body_type *my_body; + multifunction_body_type *my_init_body; + output_ports_type my_output_ports; + +}; // multifunction_input + +// template to refer to an output port of a multifunction_node +template +typename std::tuple_element::type &output_port(MOP &op) { + return std::get(op.output_ports()); +} + +inline void check_task_and_spawn(graph& g, graph_task* t) { + if (t && t != SUCCESSFULLY_ENQUEUED) { + spawn_in_graph_arena(g, *t); + } +} + +// helper structs for split_node +template +struct emit_element { + template + static graph_task* emit_this(graph& g, const T &t, P &p) { + // TODO: consider to collect all the tasks in task_list and spawn them all at once + graph_task* last_task = std::get(p).try_put_task(std::get(t)); + check_task_and_spawn(g, last_task); + return emit_element::emit_this(g,t,p); + } +}; + +template<> +struct emit_element<1> { + template + static graph_task* emit_this(graph& g, const T &t, P &p) { + graph_task* last_task = std::get<0>(p).try_put_task(std::get<0>(t)); + check_task_and_spawn(g, last_task); + return SUCCESSFULLY_ENQUEUED; + } +}; + +//! Implements methods for an executable node that takes continue_msg as input +template< typename Output, typename Policy> +class continue_input : public continue_receiver { +public: + + //! The input type of this receiver + typedef continue_msg input_type; + + //! 
The output type of this receiver + typedef Output output_type; + typedef function_body function_body_type; + typedef continue_input class_type; + + template< typename Body > + continue_input( graph &g, Body& body, node_priority_t a_priority ) + : continue_receiver(/*number_of_predecessors=*/0, a_priority) + , my_graph_ref(g) + , my_body( new function_body_leaf< input_type, output_type, Body>(body) ) + , my_init_body( new function_body_leaf< input_type, output_type, Body>(body) ) + { } + + template< typename Body > + continue_input( graph &g, int number_of_predecessors, + Body& body, node_priority_t a_priority ) + : continue_receiver( number_of_predecessors, a_priority ) + , my_graph_ref(g) + , my_body( new function_body_leaf< input_type, output_type, Body>(body) ) + , my_init_body( new function_body_leaf< input_type, output_type, Body>(body) ) + { } + + continue_input( const continue_input& src ) : continue_receiver(src), + my_graph_ref(src.my_graph_ref), + my_body( src.my_init_body->clone() ), + my_init_body( src.my_init_body->clone() ) {} + + ~continue_input() { + delete my_body; + delete my_init_body; + } + + template< typename Body > + Body copy_function_object() { + function_body_type &body_ref = *my_body; + return dynamic_cast< function_body_leaf & >(body_ref).get_body(); + } + + void reset_receiver( reset_flags f) override { + continue_receiver::reset_receiver(f); + if(f & rf_reset_bodies) { + function_body_type *tmp = my_init_body->clone(); + delete my_body; + my_body = tmp; + } + } + +protected: + + graph& my_graph_ref; + function_body_type *my_body; + function_body_type *my_init_body; + + virtual broadcast_cache &successors() = 0; + + friend class apply_body_task_bypass< class_type, continue_msg >; + + //! Applies the body to the provided input + graph_task* apply_body_bypass( input_type ) { + // There is an extra copied needed to capture the + // body execution without the try_put + fgt_begin_body( my_body ); + output_type v = (*my_body)( continue_msg() ); + fgt_end_body( my_body ); + return successors().try_put_task( v ); + } + + graph_task* execute() override { + if(!is_graph_active(my_graph_ref)) { + return nullptr; + } +#if _MSC_VER && !__INTEL_COMPILER +#pragma warning (push) +#pragma warning (disable: 4127) /* suppress conditional expression is constant */ +#endif + if(has_policy::value) { +#if _MSC_VER && !__INTEL_COMPILER +#pragma warning (pop) +#endif + return apply_body_bypass( continue_msg() ); + } + else { + small_object_allocator allocator{}; + typedef apply_body_task_bypass task_type; + graph_task* t = allocator.new_object( graph_reference(), allocator, *this, continue_msg(), my_priority ); + graph_reference().reserve_wait(); + return t; + } + } + + graph& graph_reference() const override { + return my_graph_ref; + } +}; // continue_input + +//! Implements methods for both executable and function nodes that puts Output to its successors +template< typename Output > +class function_output : public sender { +public: + + template friend struct clear_element; + typedef Output output_type; + typedef typename sender::successor_type successor_type; + typedef broadcast_cache broadcast_cache_type; + + function_output(graph& g) : my_successors(this), my_graph_ref(g) {} + function_output(const function_output& other) = delete; + + //! Adds a new successor to this node + bool register_successor( successor_type &r ) override { + successors().register_successor( r ); + return true; + } + + //! 
Removes a successor from this node + bool remove_successor( successor_type &r ) override { + successors().remove_successor( r ); + return true; + } + + broadcast_cache_type &successors() { return my_successors; } + + graph& graph_reference() const { return my_graph_ref; } +protected: + broadcast_cache_type my_successors; + graph& my_graph_ref; +}; // function_output + +template< typename Output > +class multifunction_output : public function_output { +public: + typedef Output output_type; + typedef function_output base_type; + using base_type::my_successors; + + multifunction_output(graph& g) : base_type(g) {} + multifunction_output(const multifunction_output& other) : base_type(other.my_graph_ref) {} + + bool try_put(const output_type &i) { + graph_task *res = try_put_task(i); + if( !res ) return false; + if( res != SUCCESSFULLY_ENQUEUED ) { + // wrapping in task_arena::execute() is not needed since the method is called from + // inside task::execute() + spawn_in_graph_arena(graph_reference(), *res); + } + return true; + } + + using base_type::graph_reference; + +protected: + + graph_task* try_put_task(const output_type &i) { + return my_successors.try_put_task(i); + } + + template friend struct emit_element; + +}; // multifunction_output + +//composite_node +template +void add_nodes_impl(CompositeType*, bool) {} + +template< typename CompositeType, typename NodeType1, typename... NodeTypes > +void add_nodes_impl(CompositeType *c_node, bool visible, const NodeType1& n1, const NodeTypes&... n) { + void *addr = const_cast(&n1); + + fgt_alias_port(c_node, addr, visible); + add_nodes_impl(c_node, visible, n...); +} + +#endif // __TBB__flow_graph_node_impl_H diff --git a/third_party/tbb/detail/_flow_graph_node_set_impl.h b/third_party/tbb/detail/_flow_graph_node_set_impl.h new file mode 100644 index 000000000..993e4fee7 --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_node_set_impl.h @@ -0,0 +1,266 @@ +// clang-format off +/* + Copyright (c) 2020-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_flow_graph_node_set_impl_H +#define __TBB_flow_graph_node_set_impl_H + +#ifndef __TBB_flow_graph_H +#error Do not #include this internal file directly; use public TBB headers instead. 
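// Illustrative usage sketch, not part of the upstream sources: the
// follows()/precedes() helpers defined in this header let a node be built from
// a set of neighbouring nodes instead of a plain graph reference, with the
// connecting edges made implicitly. This assumes the preview feature is
// enabled (__TBB_PREVIEW_FLOW_GRAPH_NODE_SET, normally switched on by defining
// TBB_PREVIEW_FLOW_GRAPH_FEATURES before including the headers); node names
// below are invented.

#include "third_party/tbb/flow_graph.h"

inline void node_set_sketch() {
    using namespace tbb::flow;
    graph g;
    function_node<int, int> doubled(g, unlimited, [](int v) { return 2 * v; });
    function_node<int, int> squared(g, unlimited, [](int v) { return v * v; });

    // Constructed from follows(...): the graph is taken from the set and an
    // edge is made from each listed predecessor, so no make_edge calls are needed.
    function_node<int, int> sink(follows(doubled, squared), serial,
                                 [](int v) { return v; });

    // precedes(...) is the mirror image: the new node is wired to each listed
    // successor instead.
    function_node<int, int> source(precedes(doubled, squared), unlimited,
                                   [](int v) { return v + 1; });

    source.try_put(1);
    g.wait_for_all();
}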
+#endif + +// Included in namespace tbb::detail::d1 (in flow_graph.h) + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET +// Visual Studio 2019 reports an error while calling predecessor_selector::get and successor_selector::get +// Seems like the well-formed expression in trailing decltype is treated as ill-formed +// TODO: investigate problems with decltype in trailing return types or find the cross-platform solution +#define __TBB_MSVC_DISABLE_TRAILING_DECLTYPE (_MSC_VER >= 1900) + +namespace order { +struct undefined {}; +struct following {}; +struct preceding {}; +} + +class get_graph_helper { +public: + // TODO: consider making graph_reference() public and consistent interface to get a reference to the graph + // and remove get_graph_helper + template + static graph& get(const T& object) { + return get_impl(object, std::is_base_of()); + } + +private: + // Get graph from the object of type derived from graph_node + template + static graph& get_impl(const T& object, std::true_type) { + return static_cast(&object)->my_graph; + } + + template + static graph& get_impl(const T& object, std::false_type) { + return object.graph_reference(); + } +}; + +template +struct node_set { + typedef Order order_type; + + std::tuple nodes; + node_set(Nodes&... ns) : nodes(ns...) {} + + template + node_set(const node_set& set) : nodes(set.nodes) {} + + graph& graph_reference() const { + return get_graph_helper::get(std::get<0>(nodes)); + } +}; + +namespace alias_helpers { +template using output_type = typename T::output_type; +template using output_ports_type = typename T::output_ports_type; +template using input_type = typename T::input_type; +template using input_ports_type = typename T::input_ports_type; +} // namespace alias_helpers + +template +using has_output_type = supports; + +template +using has_input_type = supports; + +template +using has_input_ports_type = supports; + +template +using has_output_ports_type = supports; + +template +struct is_sender : std::is_base_of, T> {}; + +template +struct is_receiver : std::is_base_of, T> {}; + +template +struct is_async_node : std::false_type {}; + +template +struct is_async_node> : std::true_type {}; + +template +node_set +follows(FirstPredecessor& first_predecessor, Predecessors&... predecessors) { + static_assert((conjunction, + has_output_type...>::value), + "Not all node's predecessors has output_type typedef"); + static_assert((conjunction, is_sender...>::value), + "Not all node's predecessors are senders"); + return node_set(first_predecessor, predecessors...); +} + +template +node_set +follows(node_set& predecessors_set) { + static_assert((conjunction...>::value), + "Not all nodes in the set has output_type typedef"); + static_assert((conjunction...>::value), + "Not all nodes in the set are senders"); + return node_set(predecessors_set); +} + +template +node_set +precedes(FirstSuccessor& first_successor, Successors&... successors) { + static_assert((conjunction, + has_input_type...>::value), + "Not all node's successors has input_type typedef"); + static_assert((conjunction, is_receiver...>::value), + "Not all node's successors are receivers"); + return node_set(first_successor, successors...); +} + +template +node_set +precedes(node_set& successors_set) { + static_assert((conjunction...>::value), + "Not all nodes in the set has input_type typedef"); + static_assert((conjunction...>::value), + "Not all nodes in the set are receivers"); + return node_set(successors_set); +} + +template +node_set +make_node_set(Node& first_node, Nodes&... 
nodes) { + return node_set(first_node, nodes...); +} + +template +class successor_selector { + template + static auto get_impl(NodeType& node, std::true_type) -> decltype(input_port(node)) { + return input_port(node); + } + + template + static NodeType& get_impl(NodeType& node, std::false_type) { return node; } + +public: + template +#if __TBB_MSVC_DISABLE_TRAILING_DECLTYPE + static auto& get(NodeType& node) +#else + static auto get(NodeType& node) -> decltype(get_impl(node, has_input_ports_type())) +#endif + { + return get_impl(node, has_input_ports_type()); + } +}; + +template +class predecessor_selector { + template + static auto internal_get(NodeType& node, std::true_type) -> decltype(output_port(node)) { + return output_port(node); + } + + template + static NodeType& internal_get(NodeType& node, std::false_type) { return node;} + + template +#if __TBB_MSVC_DISABLE_TRAILING_DECLTYPE + static auto& get_impl(NodeType& node, std::false_type) +#else + static auto get_impl(NodeType& node, std::false_type) -> decltype(internal_get(node, has_output_ports_type())) +#endif + { + return internal_get(node, has_output_ports_type()); + } + + template + static AsyncNode& get_impl(AsyncNode& node, std::true_type) { return node; } + +public: + template +#if __TBB_MSVC_DISABLE_TRAILING_DECLTYPE + static auto& get(NodeType& node) +#else + static auto get(NodeType& node) -> decltype(get_impl(node, is_async_node())) +#endif + { + return get_impl(node, is_async_node()); + } +}; + +template +class make_edges_helper { +public: + template + static void connect_predecessors(PredecessorsTuple& predecessors, NodeType& node) { + make_edge(std::get(predecessors), successor_selector::get(node)); + make_edges_helper::connect_predecessors(predecessors, node); + } + + template + static void connect_successors(NodeType& node, SuccessorsTuple& successors) { + make_edge(predecessor_selector::get(node), std::get(successors)); + make_edges_helper::connect_successors(node, successors); + } +}; + +template<> +struct make_edges_helper<0> { + template + static void connect_predecessors(PredecessorsTuple& predecessors, NodeType& node) { + make_edge(std::get<0>(predecessors), successor_selector<0>::get(node)); + } + + template + static void connect_successors(NodeType& node, SuccessorsTuple& successors) { + make_edge(predecessor_selector<0>::get(node), std::get<0>(successors)); + } +}; + +// TODO: consider adding an overload for making edges between node sets +template +void make_edges(const node_set& s, NodeType& node) { + const std::size_t SetSize = std::tuple_size::value; + make_edges_helper::connect_predecessors(s.nodes, node); +} + +template +void make_edges(NodeType& node, const node_set& s) { + const std::size_t SetSize = std::tuple_size::value; + make_edges_helper::connect_successors(node, s.nodes); +} + +template +void make_edges_in_order(const node_set& ns, NodeType& node) { + make_edges(ns, node); +} + +template +void make_edges_in_order(const node_set& ns, NodeType& node) { + make_edges(node, ns); +} + +#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + +#endif // __TBB_flow_graph_node_set_impl_H diff --git a/third_party/tbb/detail/_flow_graph_nodes_deduction.h b/third_party/tbb/detail/_flow_graph_nodes_deduction.h new file mode 100644 index 000000000..4eaa7a8b4 --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_nodes_deduction.h @@ -0,0 +1,278 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except 
in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_flow_graph_nodes_deduction_H +#define __TBB_flow_graph_nodes_deduction_H + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +namespace tbb { +namespace detail { +namespace d1 { + +template +struct declare_body_types { + using input_type = Input; + using output_type = Output; +}; + +struct NoInputBody {}; + +template +struct declare_body_types { + using output_type = Output; +}; + +template struct body_types; + +template +struct body_types : declare_body_types {}; + +template +struct body_types : declare_body_types {}; + +template +struct body_types : declare_body_types {}; + +template +struct body_types : declare_body_types {}; + +template +struct body_types : declare_body_types {}; + +template +struct body_types : declare_body_types {}; + +template +struct body_types : declare_body_types {}; + +template +struct body_types : declare_body_types {}; + +template +struct body_types : declare_body_types {}; + +template +using input_t = typename body_types::input_type; + +template +using output_t = typename body_types::output_type; + +template +auto decide_on_operator_overload(Output (T::*name)(const Input&) const)->decltype(name); + +template +auto decide_on_operator_overload(Output (T::*name)(const Input&))->decltype(name); + +template +auto decide_on_operator_overload(Output (T::*name)(Input&) const)->decltype(name); + +template +auto decide_on_operator_overload(Output (T::*name)(Input&))->decltype(name); + +template +auto decide_on_operator_overload(Output (*name)(const Input&))->decltype(name); + +template +auto decide_on_operator_overload(Output (*name)(Input&))->decltype(name); + +template +decltype(decide_on_operator_overload(&Body::operator())) decide_on_callable_type(int); + +template +decltype(decide_on_operator_overload(std::declval())) decide_on_callable_type(...); + +// Deduction guides for Flow Graph nodes + +template +input_node(GraphOrSet&&, Body) +->input_node(0))>>; + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + +template +struct decide_on_set; + +template +struct decide_on_set> { + using type = typename Node::output_type; +}; + +template +struct decide_on_set> { + using type = typename Node::input_type; +}; + +template +using decide_on_set_t = typename decide_on_set>::type; + +template +broadcast_node(const NodeSet&) +->broadcast_node>; + +template +buffer_node(const NodeSet&) +->buffer_node>; + +template +queue_node(const NodeSet&) +->queue_node>; +#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + +template +sequencer_node(GraphOrProxy&&, Sequencer) +->sequencer_node(0))>>; + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET +template +priority_queue_node(const NodeSet&, const Compare&) +->priority_queue_node, Compare>; + +template +priority_queue_node(const NodeSet&) +->priority_queue_node, std::less>>; +#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + +template +struct join_key { + using type = Key; +}; + +template +struct join_key { + using type = T&; +}; + +template +using join_key_t = typename join_key::type; + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET +template +join_node(const node_set&, Policy) +->join_node, + Policy>; + +template 
+join_node(const node_set&, Policy) +->join_node; + +template +join_node(const node_set) +->join_node, + queueing>; + +template +join_node(const node_set) +->join_node; +#endif + +template +join_node(GraphOrProxy&&, Body, Bodies...) +->join_node(0))>, + input_t(0))>...>, + key_matching(0))>>>>; + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET +template +indexer_node(const node_set&) +->indexer_node; +#endif + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET +template +limiter_node(const NodeSet&, size_t) +->limiter_node>; + +template +split_node(const node_set&) +->split_node; + +template +split_node(const node_set&) +->split_node>; + +#endif + +template +function_node(GraphOrSet&&, + size_t, Body, + Policy, node_priority_t = no_priority) +->function_node(0))>, + output_t(0))>, + Policy>; + +template +function_node(GraphOrSet&&, size_t, + Body, node_priority_t = no_priority) +->function_node(0))>, + output_t(0))>, + queueing>; + +template +struct continue_output { + using type = Output; +}; + +template <> +struct continue_output { + using type = continue_msg; +}; + +template +using continue_output_t = typename continue_output::type; + +template +continue_node(GraphOrSet&&, Body, + Policy, node_priority_t = no_priority) +->continue_node>, + Policy>; + +template +continue_node(GraphOrSet&&, + int, Body, + Policy, node_priority_t = no_priority) +->continue_node>, + Policy>; + +template +continue_node(GraphOrSet&&, + Body, node_priority_t = no_priority) +->continue_node>, Policy>; + +template +continue_node(GraphOrSet&&, int, + Body, node_priority_t = no_priority) +->continue_node>, + Policy>; + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + +template +overwrite_node(const NodeSet&) +->overwrite_node>; + +template +write_once_node(const NodeSet&) +->write_once_node>; +#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +#endif // __TBB_flow_graph_nodes_deduction_H diff --git a/third_party/tbb/detail/_flow_graph_tagged_buffer_impl.h b/third_party/tbb/detail/_flow_graph_tagged_buffer_impl.h new file mode 100644 index 000000000..68ce59b96 --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_tagged_buffer_impl.h @@ -0,0 +1,258 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// a hash table buffer that can expand, and can support as many deletions as +// additions, list-based, with elements of list held in array (for destruction +// management), multiplicative hashing (like ets). No synchronization built-in. +// + +#ifndef __TBB__flow_graph_hash_buffer_impl_H +#define __TBB__flow_graph_hash_buffer_impl_H + +#ifndef __TBB_flow_graph_H +#error Do not #include this internal file directly; use public TBB headers instead. 
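// Illustrative sketch, not part of the upstream sources: with C++17 class
// template argument deduction, the guides above let the node template
// arguments be inferred from the body's call signature (function and variable
// names below are invented).

#include "third_party/tbb/flow_graph.h"

inline void deduction_guides_sketch() {
    using namespace tbb::flow;
    graph g;

    // Deduced as function_node<int, double, queueing>: input_t comes from the
    // lambda's (const int&) parameter, output_t from its double return type.
    function_node halver(g, unlimited, [](const int& v) -> double { return v / 2.0; });

    // A body callable with continue_msg and returning void deduces
    // continue_node<continue_msg> via continue_output_t.
    continue_node tick(g, [](const continue_msg&) {});

    halver.try_put(7);
    tick.try_put(continue_msg());
    g.wait_for_all();
}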
+#endif + +// included in namespace tbb::flow::interfaceX::internal + +// elements in the table are a simple list; we need pointer to next element to +// traverse the chain +template +struct buffer_element_type { + // the second parameter below is void * because we can't forward-declare the type + // itself, so we just reinterpret_cast below. + typedef typename aligned_pair::type type; +}; + +template + < + typename Key, // type of key within ValueType + typename ValueType, + typename ValueToKey, // abstract method that returns "const Key" or "const Key&" given ValueType + typename HashCompare, // has hash and equal + typename Allocator=tbb::cache_aligned_allocator< typename aligned_pair::type > + > +class hash_buffer : public HashCompare { +public: + static const size_t INITIAL_SIZE = 8; // initial size of the hash pointer table + typedef ValueType value_type; + typedef typename buffer_element_type< value_type >::type element_type; + typedef value_type *pointer_type; + typedef element_type *list_array_type; // array we manage manually + typedef list_array_type *pointer_array_type; + typedef typename std::allocator_traits::template rebind_alloc pointer_array_allocator_type; + typedef typename std::allocator_traits::template rebind_alloc elements_array_allocator; + typedef typename std::decay::type Knoref; + +private: + ValueToKey *my_key; + size_t my_size; + size_t nelements; + pointer_array_type pointer_array; // pointer_array[my_size] + list_array_type elements_array; // elements_array[my_size / 2] + element_type* free_list; + + size_t mask() { return my_size - 1; } + + void set_up_free_list( element_type **p_free_list, list_array_type la, size_t sz) { + for(size_t i=0; i < sz - 1; ++i ) { // construct free list + la[i].second = &(la[i+1]); + } + la[sz-1].second = nullptr; + *p_free_list = (element_type *)&(la[0]); + } + + // cleanup for exceptions + struct DoCleanup { + pointer_array_type *my_pa; + list_array_type *my_elements; + size_t my_size; + + DoCleanup(pointer_array_type &pa, list_array_type &my_els, size_t sz) : + my_pa(&pa), my_elements(&my_els), my_size(sz) { } + ~DoCleanup() { + if(my_pa) { + size_t dont_care = 0; + internal_free_buffer(*my_pa, *my_elements, my_size, dont_care); + } + } + }; + + // exception-safety requires we do all the potentially-throwing operations first + void grow_array() { + size_t new_size = my_size*2; + size_t new_nelements = nelements; // internal_free_buffer zeroes this + list_array_type new_elements_array = nullptr; + pointer_array_type new_pointer_array = nullptr; + list_array_type new_free_list = nullptr; + { + DoCleanup my_cleanup(new_pointer_array, new_elements_array, new_size); + new_elements_array = elements_array_allocator().allocate(my_size); + new_pointer_array = pointer_array_allocator_type().allocate(new_size); + for(size_t i=0; i < new_size; ++i) new_pointer_array[i] = nullptr; + set_up_free_list(&new_free_list, new_elements_array, my_size ); + + for(size_t i=0; i < my_size; ++i) { + for( element_type* op = pointer_array[i]; op; op = (element_type *)(op->second)) { + value_type *ov = reinterpret_cast(&(op->first)); + // could have std::move semantics + internal_insert_with_key(new_pointer_array, new_size, new_free_list, *ov); + } + } + my_cleanup.my_pa = nullptr; + my_cleanup.my_elements = nullptr; + } + + internal_free_buffer(pointer_array, elements_array, my_size, nelements); + free_list = new_free_list; + pointer_array = new_pointer_array; + elements_array = new_elements_array; + my_size = new_size; + nelements = new_nelements; + } 
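// Illustrative sketch, not part of the upstream sources: a much simplified
// model of the policy this buffer implements -- separate chaining over a
// power-of-two bucket array that doubles once the element count exceeds half
// the bucket count (the nelements*2 > my_size test in insert_with_key() below).
// The real class additionally recycles nodes through free_list and manages
// raw, allocator-provided storage; names below are invented.

#include <cstddef>
#include <forward_list>
#include <functional>
#include <utility>
#include <vector>

template <typename Key, typename Value, typename Hash = std::hash<Key>>
class toy_chained_table {
    std::vector<std::forward_list<std::pair<Key, Value>>> buckets_;
    std::size_t count_ = 0;
    Hash hash_;

    std::size_t mask() const { return buckets_.size() - 1; }   // size is a power of two

    void rehash_doubled() {
        std::vector<std::forward_list<std::pair<Key, Value>>> bigger(buckets_.size() * 2);
        for (auto& chain : buckets_)
            for (auto& kv : chain)
                bigger[hash_(kv.first) & (bigger.size() - 1)].push_front(std::move(kv));
        buckets_.swap(bigger);
    }

public:
    toy_chained_table() : buckets_(8) {}   // mirrors INITIAL_SIZE above

    void insert(const Key& k, const Value& v) {
        ++count_;
        if (count_ * 2 > buckets_.size()) rehash_doubled();   // keep load factor <= 1/2
        buckets_[hash_(k) & mask()].push_front({k, v});
    }

    const Value* find(const Key& k) const {
        for (const auto& kv : buckets_[hash_(k) & mask()])
            if (kv.first == k) return &kv.second;
        return nullptr;
    }
};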
+ + // v should have perfect forwarding if std::move implemented. + // we use this method to move elements in grow_array, so can't use class fields + void internal_insert_with_key( element_type **p_pointer_array, size_t p_sz, list_array_type &p_free_list, + const value_type &v) { + size_t l_mask = p_sz-1; + __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); + size_t h = this->hash(tbb::detail::invoke(*my_key, v)) & l_mask; + __TBB_ASSERT(p_free_list, "Error: free list not set up."); + element_type* my_elem = p_free_list; p_free_list = (element_type *)(p_free_list->second); + (void) new(&(my_elem->first)) value_type(v); + my_elem->second = p_pointer_array[h]; + p_pointer_array[h] = my_elem; + } + + void internal_initialize_buffer() { + pointer_array = pointer_array_allocator_type().allocate(my_size); + for(size_t i = 0; i < my_size; ++i) pointer_array[i] = nullptr; + elements_array = elements_array_allocator().allocate(my_size / 2); + set_up_free_list(&free_list, elements_array, my_size / 2); + } + + // made static so an enclosed class can use to properly dispose of the internals + static void internal_free_buffer( pointer_array_type &pa, list_array_type &el, size_t &sz, size_t &ne ) { + if(pa) { + for(size_t i = 0; i < sz; ++i ) { + element_type *p_next; + for( element_type *p = pa[i]; p; p = p_next) { + p_next = (element_type *)p->second; + // TODO revamp: make sure type casting is correct. + void* ptr = (void*)(p->first); +#if _MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER + suppress_unused_warning(ptr); +#endif + ((value_type*)ptr)->~value_type(); + } + } + pointer_array_allocator_type().deallocate(pa, sz); + pa = nullptr; + } + // Separate test (if allocation of pa throws, el may be allocated. + // but no elements will be constructed.) + if(el) { + elements_array_allocator().deallocate(el, sz / 2); + el = nullptr; + } + sz = INITIAL_SIZE; + ne = 0; + } + +public: + hash_buffer() : my_key(nullptr), my_size(INITIAL_SIZE), nelements(0) { + internal_initialize_buffer(); + } + + ~hash_buffer() { + internal_free_buffer(pointer_array, elements_array, my_size, nelements); + delete my_key; + my_key = nullptr; + } + hash_buffer(const hash_buffer&) = delete; + hash_buffer& operator=(const hash_buffer&) = delete; + + void reset() { + internal_free_buffer(pointer_array, elements_array, my_size, nelements); + internal_initialize_buffer(); + } + + // Take ownership of func object allocated with new. + // This method is only used internally, so can't be misused by user. + void set_key_func(ValueToKey *vtk) { my_key = vtk; } + // pointer is used to clone() + ValueToKey* get_key_func() { return my_key; } + + bool insert_with_key(const value_type &v) { + pointer_type p = nullptr; + __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); + if(find_ref_with_key(tbb::detail::invoke(*my_key, v), p)) { + p->~value_type(); + (void) new(p) value_type(v); // copy-construct into the space + return false; + } + ++nelements; + if(nelements*2 > my_size) grow_array(); + internal_insert_with_key(pointer_array, my_size, free_list, v); + return true; + } + + // returns true and sets v to array element if found, else returns false. 
+ bool find_ref_with_key(const Knoref& k, pointer_type &v) { + size_t i = this->hash(k) & mask(); + for(element_type* p = pointer_array[i]; p; p = (element_type *)(p->second)) { + pointer_type pv = reinterpret_cast(&(p->first)); + __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); + if(this->equal(tbb::detail::invoke(*my_key, *pv), k)) { + v = pv; + return true; + } + } + return false; + } + + bool find_with_key( const Knoref& k, value_type &v) { + value_type *p; + if(find_ref_with_key(k, p)) { + v = *p; + return true; + } + else + return false; + } + + void delete_with_key(const Knoref& k) { + size_t h = this->hash(k) & mask(); + element_type* prev = nullptr; + for(element_type* p = pointer_array[h]; p; prev = p, p = (element_type *)(p->second)) { + value_type *vp = reinterpret_cast(&(p->first)); + __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); + if(this->equal(tbb::detail::invoke(*my_key, *vp), k)) { + vp->~value_type(); + if(prev) prev->second = p->second; + else pointer_array[h] = (element_type *)(p->second); + p->second = free_list; + free_list = p; + --nelements; + return; + } + } + __TBB_ASSERT(false, "key not found for delete"); + } +}; +#endif // __TBB__flow_graph_hash_buffer_impl_H diff --git a/third_party/tbb/detail/_flow_graph_trace_impl.h b/third_party/tbb/detail/_flow_graph_trace_impl.h new file mode 100644 index 000000000..dc9c857be --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_trace_impl.h @@ -0,0 +1,365 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef _FGT_GRAPH_TRACE_IMPL_H +#define _FGT_GRAPH_TRACE_IMPL_H + +#include "third_party/tbb/profiling.h" +#if (_MSC_VER >= 1900) + // MISSING #include +#endif + +namespace tbb { +namespace detail { +namespace d1 { + +template< typename T > class sender; +template< typename T > class receiver; + +#if TBB_USE_PROFILING_TOOLS + #if __TBB_FLOW_TRACE_CODEPTR + #if (_MSC_VER >= 1900) + #define CODEPTR() (_ReturnAddress()) + #elif __TBB_GCC_VERSION >= 40800 + #define CODEPTR() ( __builtin_return_address(0)) + #else + #define CODEPTR() nullptr + #endif + #else + #define CODEPTR() nullptr + #endif /* __TBB_FLOW_TRACE_CODEPTR */ + +static inline void fgt_alias_port(void *node, void *p, bool visible) { + if(visible) + itt_relation_add( ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_NODE ); + else + itt_relation_add( ITT_DOMAIN_FLOW, p, FLOW_NODE, __itt_relation_is_child_of, node, FLOW_NODE ); +} + +static inline void fgt_composite ( void* codeptr, void *node, void *graph ) { + itt_make_task_group( ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_COMPOSITE_NODE ); + suppress_unused_warning( codeptr ); +#if __TBB_FLOW_TRACE_CODEPTR + if (codeptr != nullptr) { + register_node_addr(ITT_DOMAIN_FLOW, node, FLOW_NODE, CODE_ADDRESS, &codeptr); + } +#endif +} + +static inline void fgt_internal_alias_input_port( void *node, void *p, string_resource_index name_index ) { + itt_make_task_group( ITT_DOMAIN_FLOW, p, FLOW_INPUT_PORT, node, FLOW_NODE, name_index ); + itt_relation_add( ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_INPUT_PORT ); +} + +static inline void fgt_internal_alias_output_port( void *node, void *p, string_resource_index name_index ) { + itt_make_task_group( ITT_DOMAIN_FLOW, p, FLOW_OUTPUT_PORT, node, FLOW_NODE, name_index ); + itt_relation_add( ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_OUTPUT_PORT ); +} + +template +void alias_input_port(void *node, receiver* port, string_resource_index name_index) { + // TODO: Make fgt_internal_alias_input_port a function template? + fgt_internal_alias_input_port( node, port, name_index); +} + +template < typename PortsTuple, int N > +struct fgt_internal_input_alias_helper { + static void alias_port( void *node, PortsTuple &ports ) { + alias_input_port( node, &(std::get(ports)), static_cast(FLOW_INPUT_PORT_0 + N - 1) ); + fgt_internal_input_alias_helper::alias_port( node, ports ); + } +}; + +template < typename PortsTuple > +struct fgt_internal_input_alias_helper { + static void alias_port( void * /* node */, PortsTuple & /* ports */ ) { } +}; + +template +void alias_output_port(void *node, sender* port, string_resource_index name_index) { + // TODO: Make fgt_internal_alias_output_port a function template? 
+ fgt_internal_alias_output_port( node, static_cast(port), name_index); +} + +template < typename PortsTuple, int N > +struct fgt_internal_output_alias_helper { + static void alias_port( void *node, PortsTuple &ports ) { + alias_output_port( node, &(std::get(ports)), static_cast(FLOW_OUTPUT_PORT_0 + N - 1) ); + fgt_internal_output_alias_helper::alias_port( node, ports ); + } +}; + +template < typename PortsTuple > +struct fgt_internal_output_alias_helper { + static void alias_port( void * /*node*/, PortsTuple &/*ports*/ ) { + } +}; + +static inline void fgt_internal_create_input_port( void *node, void *p, string_resource_index name_index ) { + itt_make_task_group( ITT_DOMAIN_FLOW, p, FLOW_INPUT_PORT, node, FLOW_NODE, name_index ); +} + +static inline void fgt_internal_create_output_port( void* codeptr, void *node, void *p, string_resource_index name_index ) { + itt_make_task_group(ITT_DOMAIN_FLOW, p, FLOW_OUTPUT_PORT, node, FLOW_NODE, name_index); + suppress_unused_warning( codeptr ); +#if __TBB_FLOW_TRACE_CODEPTR + if (codeptr != nullptr) { + register_node_addr(ITT_DOMAIN_FLOW, node, FLOW_NODE, CODE_ADDRESS, &codeptr); + } +#endif +} + +template +void register_input_port(void *node, receiver* port, string_resource_index name_index) { + // TODO: Make fgt_internal_create_input_port a function template? + fgt_internal_create_input_port(node, static_cast(port), name_index); +} + +template < typename PortsTuple, int N > +struct fgt_internal_input_helper { + static void register_port( void *node, PortsTuple &ports ) { + register_input_port( node, &(std::get(ports)), static_cast(FLOW_INPUT_PORT_0 + N - 1) ); + fgt_internal_input_helper::register_port( node, ports ); + } +}; + +template < typename PortsTuple > +struct fgt_internal_input_helper { + static void register_port( void *node, PortsTuple &ports ) { + register_input_port( node, &(std::get<0>(ports)), FLOW_INPUT_PORT_0 ); + } +}; + +template +void register_output_port(void* codeptr, void *node, sender* port, string_resource_index name_index) { + // TODO: Make fgt_internal_create_output_port a function template? 
+ fgt_internal_create_output_port( codeptr, node, static_cast(port), name_index); +} + +template < typename PortsTuple, int N > +struct fgt_internal_output_helper { + static void register_port( void* codeptr, void *node, PortsTuple &ports ) { + register_output_port( codeptr, node, &(std::get(ports)), static_cast(FLOW_OUTPUT_PORT_0 + N - 1) ); + fgt_internal_output_helper::register_port( codeptr, node, ports ); + } +}; + +template < typename PortsTuple > +struct fgt_internal_output_helper { + static void register_port( void* codeptr, void *node, PortsTuple &ports ) { + register_output_port( codeptr, node, &(std::get<0>(ports)), FLOW_OUTPUT_PORT_0 ); + } +}; + +template< typename NodeType > +void fgt_multioutput_node_desc( const NodeType *node, const char *desc ) { + void *addr = (void *)( static_cast< receiver< typename NodeType::input_type > * >(const_cast< NodeType *>(node)) ); + itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); +} + +template< typename NodeType > +void fgt_multiinput_multioutput_node_desc( const NodeType *node, const char *desc ) { + void *addr = const_cast(node); + itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); +} + +template< typename NodeType > +static inline void fgt_node_desc( const NodeType *node, const char *desc ) { + void *addr = (void *)( static_cast< sender< typename NodeType::output_type > * >(const_cast< NodeType *>(node)) ); + itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); +} + +static inline void fgt_graph_desc( const void *g, const char *desc ) { + void *addr = const_cast< void *>(g); + itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_GRAPH, FLOW_OBJECT_NAME, desc ); +} + +static inline void fgt_body( void *node, void *body ) { + itt_relation_add( ITT_DOMAIN_FLOW, body, FLOW_BODY, __itt_relation_is_child_of, node, FLOW_NODE ); +} + +template< int N, typename PortsTuple > +static inline void fgt_multioutput_node(void* codeptr, string_resource_index t, void *g, void *input_port, PortsTuple &ports ) { + itt_make_task_group( ITT_DOMAIN_FLOW, input_port, FLOW_NODE, g, FLOW_GRAPH, t ); + fgt_internal_create_input_port( input_port, input_port, FLOW_INPUT_PORT_0 ); + fgt_internal_output_helper::register_port(codeptr, input_port, ports ); +} + +template< int N, typename PortsTuple > +static inline void fgt_multioutput_node_with_body( void* codeptr, string_resource_index t, void *g, void *input_port, PortsTuple &ports, void *body ) { + itt_make_task_group( ITT_DOMAIN_FLOW, input_port, FLOW_NODE, g, FLOW_GRAPH, t ); + fgt_internal_create_input_port( input_port, input_port, FLOW_INPUT_PORT_0 ); + fgt_internal_output_helper::register_port( codeptr, input_port, ports ); + fgt_body( input_port, body ); +} + +template< int N, typename PortsTuple > +static inline void fgt_multiinput_node( void* codeptr, string_resource_index t, void *g, PortsTuple &ports, void *output_port) { + itt_make_task_group( ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); + fgt_internal_create_output_port( codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 ); + fgt_internal_input_helper::register_port( output_port, ports ); +} + +static inline void fgt_multiinput_multioutput_node( void* codeptr, string_resource_index t, void *n, void *g ) { + itt_make_task_group( ITT_DOMAIN_FLOW, n, FLOW_NODE, g, FLOW_GRAPH, t ); + suppress_unused_warning( codeptr ); +#if __TBB_FLOW_TRACE_CODEPTR + if (codeptr != nullptr) { + register_node_addr(ITT_DOMAIN_FLOW, n, FLOW_NODE, CODE_ADDRESS, 
&codeptr); + } +#endif +} + +static inline void fgt_node( void* codeptr, string_resource_index t, void *g, void *output_port ) { + itt_make_task_group( ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); + fgt_internal_create_output_port( codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 ); +} + +static void fgt_node_with_body( void* codeptr, string_resource_index t, void *g, void *output_port, void *body ) { + itt_make_task_group( ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); + fgt_internal_create_output_port(codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 ); + fgt_body( output_port, body ); +} + +static inline void fgt_node( void* codeptr, string_resource_index t, void *g, void *input_port, void *output_port ) { + fgt_node( codeptr, t, g, output_port ); + fgt_internal_create_input_port( output_port, input_port, FLOW_INPUT_PORT_0 ); +} + +static inline void fgt_node_with_body( void* codeptr, string_resource_index t, void *g, void *input_port, void *output_port, void *body ) { + fgt_node_with_body( codeptr, t, g, output_port, body ); + fgt_internal_create_input_port( output_port, input_port, FLOW_INPUT_PORT_0 ); +} + + +static inline void fgt_node( void* codeptr, string_resource_index t, void *g, void *input_port, void *decrement_port, void *output_port ) { + fgt_node( codeptr, t, g, input_port, output_port ); + fgt_internal_create_input_port( output_port, decrement_port, FLOW_INPUT_PORT_1 ); +} + +static inline void fgt_make_edge( void *output_port, void *input_port ) { + itt_relation_add( ITT_DOMAIN_FLOW, output_port, FLOW_OUTPUT_PORT, __itt_relation_is_predecessor_to, input_port, FLOW_INPUT_PORT); +} + +static inline void fgt_remove_edge( void *output_port, void *input_port ) { + itt_relation_add( ITT_DOMAIN_FLOW, output_port, FLOW_OUTPUT_PORT, __itt_relation_is_sibling_of, input_port, FLOW_INPUT_PORT); +} + +static inline void fgt_graph( void *g ) { + itt_make_task_group( ITT_DOMAIN_FLOW, g, FLOW_GRAPH, nullptr, FLOW_NULL, FLOW_GRAPH ); +} + +static inline void fgt_begin_body( void *body ) { + itt_task_begin( ITT_DOMAIN_FLOW, body, FLOW_BODY, nullptr, FLOW_NULL, FLOW_BODY ); +} + +static inline void fgt_end_body( void * ) { + itt_task_end( ITT_DOMAIN_FLOW ); +} + +static inline void fgt_async_try_put_begin( void *node, void *port ) { + itt_task_begin( ITT_DOMAIN_FLOW, port, FLOW_OUTPUT_PORT, node, FLOW_NODE, FLOW_OUTPUT_PORT ); +} + +static inline void fgt_async_try_put_end( void *, void * ) { + itt_task_end( ITT_DOMAIN_FLOW ); +} + +static inline void fgt_async_reserve( void *node, void *graph ) { + itt_region_begin( ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_NULL ); +} + +static inline void fgt_async_commit( void *node, void * /*graph*/) { + itt_region_end( ITT_DOMAIN_FLOW, node, FLOW_NODE ); +} + +static inline void fgt_reserve_wait( void *graph ) { + itt_region_begin( ITT_DOMAIN_FLOW, graph, FLOW_GRAPH, nullptr, FLOW_NULL, FLOW_NULL ); +} + +static inline void fgt_release_wait( void *graph ) { + itt_region_end( ITT_DOMAIN_FLOW, graph, FLOW_GRAPH ); +} + +#else // TBB_USE_PROFILING_TOOLS + +#define CODEPTR() nullptr + +static inline void fgt_alias_port(void * /*node*/, void * /*p*/, bool /*visible*/ ) { } + +static inline void fgt_composite ( void* /*codeptr*/, void * /*node*/, void * /*graph*/ ) { } + +static inline void fgt_graph( void * /*g*/ ) { } + +template< typename NodeType > +static inline void fgt_multioutput_node_desc( const NodeType * /*node*/, const char * /*desc*/ ) { } + +template< typename NodeType > +static inline 
void fgt_node_desc( const NodeType * /*node*/, const char * /*desc*/ ) { } + +static inline void fgt_graph_desc( const void * /*g*/, const char * /*desc*/ ) { } + +template< int N, typename PortsTuple > +static inline void fgt_multioutput_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, PortsTuple & /*ports*/ ) { } + +template< int N, typename PortsTuple > +static inline void fgt_multioutput_node_with_body( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, PortsTuple & /*ports*/, void * /*body*/ ) { } + +template< int N, typename PortsTuple > +static inline void fgt_multiinput_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, PortsTuple & /*ports*/, void * /*output_port*/ ) { } + +static inline void fgt_multiinput_multioutput_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*node*/, void * /*graph*/ ) { } + +static inline void fgt_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, void * /*output_port*/ ) { } +static inline void fgt_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, void * /*decrement_port*/, void * /*output_port*/ ) { } + +static inline void fgt_node_with_body( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*output_port*/, void * /*body*/ ) { } +static inline void fgt_node_with_body( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, void * /*output_port*/, void * /*body*/ ) { } + +static inline void fgt_make_edge( void * /*output_port*/, void * /*input_port*/ ) { } +static inline void fgt_remove_edge( void * /*output_port*/, void * /*input_port*/ ) { } + +static inline void fgt_begin_body( void * /*body*/ ) { } +static inline void fgt_end_body( void * /*body*/) { } + +static inline void fgt_async_try_put_begin( void * /*node*/, void * /*port*/ ) { } +static inline void fgt_async_try_put_end( void * /*node*/ , void * /*port*/ ) { } +static inline void fgt_async_reserve( void * /*node*/, void * /*graph*/ ) { } +static inline void fgt_async_commit( void * /*node*/, void * /*graph*/ ) { } +static inline void fgt_reserve_wait( void * /*graph*/ ) { } +static inline void fgt_release_wait( void * /*graph*/ ) { } + +template< typename NodeType > +void fgt_multiinput_multioutput_node_desc( const NodeType * /*node*/, const char * /*desc*/ ) { } + +template < typename PortsTuple, int N > +struct fgt_internal_input_alias_helper { + static void alias_port( void * /*node*/, PortsTuple & /*ports*/ ) { } +}; + +template < typename PortsTuple, int N > +struct fgt_internal_output_alias_helper { + static void alias_port( void * /*node*/, PortsTuple & /*ports*/ ) { } +}; + +#endif // TBB_USE_PROFILING_TOOLS + +} // d1 +} // namespace detail +} // namespace tbb + +#endif // _FGT_GRAPH_TRACE_IMPL_H diff --git a/third_party/tbb/detail/_flow_graph_types_impl.h b/third_party/tbb/detail/_flow_graph_types_impl.h new file mode 100644 index 000000000..3de282c3b --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_types_impl.h @@ -0,0 +1,408 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB__flow_graph_types_impl_H +#define __TBB__flow_graph_types_impl_H + +#ifndef __TBB_flow_graph_H +#error Do not #include this internal file directly; use public TBB headers instead. +#endif + +// included in namespace tbb::detail::d1 + +// the change to key_matching (adding a K and KHash template parameter, making it a class) +// means we have to pass this data to the key_matching_port. All the ports have only one +// template parameter, so we have to wrap the following types in a trait: +// +// . K == key_type +// . KHash == hash and compare for Key +// . TtoK == function_body that given an object of T, returns its K +// . T == type accepted by port, and stored in the hash table +// +// The port will have an additional parameter on node construction, which is a function_body +// that accepts a const T& and returns a K which is the field in T which is its K. +template +struct KeyTrait { + typedef Kp K; + typedef Tp T; + typedef type_to_key_function_body TtoK; + typedef KHashp KHash; +}; + +// wrap each element of a tuple in a template, and make a tuple of the result. +template class PT, typename TypeTuple> +struct wrap_tuple_elements; + +// A wrapper that generates the traits needed for each port of a key-matching join, +// and the type of the tuple of input ports. +template class PT, typename KeyTraits, typename TypeTuple> +struct wrap_key_tuple_elements; + +template class PT, typename... Args> +struct wrap_tuple_elements >{ + typedef typename std::tuple... > type; +}; + +template class PT, typename KeyTraits, typename... Args> +struct wrap_key_tuple_elements > { + typedef typename KeyTraits::key_type K; + typedef typename KeyTraits::hash_compare_type KHash; + typedef typename std::tuple >... > type; +}; + +template< int... S > class sequence {}; + +template< int N, int... S > +struct make_sequence : make_sequence < N - 1, N - 1, S... > {}; + +template< int... S > +struct make_sequence < 0, S... > { + typedef sequence type; +}; + +//! type mimicking std::pair but with trailing fill to ensure each element of an array +//* will have the correct alignment +template +struct type_plus_align { + char first[sizeof(T1)]; + T2 second; + char fill1[REM]; +}; + +template +struct type_plus_align { + char first[sizeof(T1)]; + T2 second; +}; + +template struct alignment_of { + typedef struct { char t; U padded; } test_alignment; + static const size_t value = sizeof(test_alignment) - sizeof(U); +}; + +// T1, T2 are actual types stored. The space defined for T1 in the type returned +// is a char array of the correct size. Type T2 should be trivially-constructible, +// T1 must be explicitly managed. +template +struct aligned_pair { + static const size_t t1_align = alignment_of::value; + static const size_t t2_align = alignment_of::value; + typedef type_plus_align just_pair; + static const size_t max_align = t1_align < t2_align ? t2_align : t1_align; + static const size_t extra_bytes = sizeof(just_pair) % max_align; + static const size_t remainder = extra_bytes ? 
max_align - extra_bytes : 0; +public: + typedef type_plus_align type; +}; // aligned_pair + +// support for variant type +// type we use when we're not storing a value +struct default_constructed { }; + +// type which contains another type, tests for what type is contained, and references to it. +// Wrapper +// void CopyTo( void *newSpace) : builds a Wrapper copy of itself in newSpace + +// struct to allow us to copy and test the type of objects +struct WrapperBase { + virtual ~WrapperBase() {} + virtual void CopyTo(void* /*newSpace*/) const = 0; +}; + +// Wrapper contains a T, with the ability to test what T is. The Wrapper can be +// constructed from a T, can be copy-constructed from another Wrapper, and can be +// examined via value(), but not modified. +template +struct Wrapper: public WrapperBase { + typedef T value_type; + typedef T* pointer_type; +private: + T value_space; +public: + const value_type &value() const { return value_space; } + +private: + Wrapper(); + + // on exception will ensure the Wrapper will contain only a trivially-constructed object + struct _unwind_space { + pointer_type space; + _unwind_space(pointer_type p) : space(p) {} + ~_unwind_space() { + if(space) (void) new (space) Wrapper(default_constructed()); + } + }; +public: + explicit Wrapper( const T& other ) : value_space(other) { } + explicit Wrapper(const Wrapper& other) = delete; + + void CopyTo(void* newSpace) const override { + _unwind_space guard((pointer_type)newSpace); + (void) new(newSpace) Wrapper(value_space); + guard.space = nullptr; + } + ~Wrapper() { } +}; + +// specialization for array objects +template +struct Wrapper : public WrapperBase { + typedef T value_type; + typedef T* pointer_type; + // space must be untyped. + typedef T ArrayType[N]; +private: + // The space is not of type T[N] because when copy-constructing, it would be + // default-initialized and then copied to in some fashion, resulting in two + // constructions and one destruction per element. If the type is char[ ], we + // placement new into each element, resulting in one construction per element. + static const size_t space_size = sizeof(ArrayType); + char value_space[space_size]; + + + // on exception will ensure the already-built objects will be destructed + // (the value_space is a char array, so it is already trivially-destructible.) 
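
The _unwind_class guard declared next performs that unwind as an RAII object. The underlying exception-safety pattern, shown here as a standalone try/catch sketch with made-up names (and assuming the destination storage is suitably aligned for T), is:

    #include <cstddef>
    #include <new>

    // Copy-construct N elements of T into raw storage. If any copy throws,
    // destroy the prefix that was already built, in reverse order, before
    // re-throwing, so the buffer is never left holding a partial array.
    template <typename T, std::size_t N>
    void copy_construct_array(void* raw, const T (&src)[N]) {
        T* dst = static_cast<T*>(raw);
        std::size_t built = 0;
        try {
            for (; built < N; ++built)
                ::new (static_cast<void*>(dst + built)) T(src[built]);
        } catch (...) {
            while (built > 0) dst[--built].~T();
            throw;
        }
    }

Wrapper goes one step further: after destroying the prefix, its guard also rebuilds a default_constructed Wrapper in the space, so the surrounding variant is still left in a destructible state.
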
+ struct _unwind_class { + pointer_type space; + int already_built; + _unwind_class(pointer_type p) : space(p), already_built(0) {} + ~_unwind_class() { + if(space) { + for(size_t i = already_built; i > 0 ; --i ) space[i-1].~value_type(); + (void) new(space) Wrapper(default_constructed()); + } + } + }; +public: + const ArrayType &value() const { + char *vp = const_cast(value_space); + return reinterpret_cast(*vp); + } + +private: + Wrapper(); +public: + // have to explicitly construct because other decays to a const value_type* + explicit Wrapper(const ArrayType& other) { + _unwind_class guard((pointer_type)value_space); + pointer_type vp = reinterpret_cast(&value_space); + for(size_t i = 0; i < N; ++i ) { + (void) new(vp++) value_type(other[i]); + ++(guard.already_built); + } + guard.space = nullptr; + } + explicit Wrapper(const Wrapper& other) : WrapperBase() { + // we have to do the heavy lifting to copy contents + _unwind_class guard((pointer_type)value_space); + pointer_type dp = reinterpret_cast(value_space); + pointer_type sp = reinterpret_cast(const_cast(other.value_space)); + for(size_t i = 0; i < N; ++i, ++dp, ++sp) { + (void) new(dp) value_type(*sp); + ++(guard.already_built); + } + guard.space = nullptr; + } + + void CopyTo(void* newSpace) const override { + (void) new(newSpace) Wrapper(*this); // exceptions handled in copy constructor + } + + ~Wrapper() { + // have to destroy explicitly in reverse order + pointer_type vp = reinterpret_cast(&value_space); + for(size_t i = N; i > 0 ; --i ) vp[i-1].~value_type(); + } +}; + +// given a tuple, return the type of the element that has the maximum alignment requirement. +// Given a tuple and that type, return the number of elements of the object with the max +// alignment requirement that is at least as big as the largest object in the tuple. + +template struct pick_one; +template struct pick_one { typedef T1 type; }; +template struct pick_one { typedef T2 type; }; + +template< template class Selector, typename T1, typename T2 > +struct pick_max { + typedef typename pick_one< (Selector::value > Selector::value), T1, T2 >::type type; +}; + +template struct size_of { static const int value = sizeof(T); }; + +template< size_t N, class Tuple, template class Selector > struct pick_tuple_max { + typedef typename pick_tuple_max::type LeftMaxType; + typedef typename std::tuple_element::type ThisType; + typedef typename pick_max::type type; +}; + +template< class Tuple, template class Selector > struct pick_tuple_max<0, Tuple, Selector> { + typedef typename std::tuple_element<0, Tuple>::type type; +}; + +// is the specified type included in a tuple? +template +struct is_element_of { + typedef typename std::tuple_element::type T_i; + static const bool value = std::is_same::value || is_element_of::value; +}; + +template +struct is_element_of { + typedef typename std::tuple_element<0, Tuple>::type T_i; + static const bool value = std::is_same::value; +}; + +// allow the construction of types that are listed tuple. If a disallowed type +// construction is written, a method involving this type is created. The +// type has no definition, so a syntax error is generated. 
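
Spelled out with complete template syntax and hypothetical names (the real declarations follow immediately below), the trick amounts to this:

    #include <new>

    // Declared but never defined: naming a member of it is a compile-time error.
    template <typename T> struct error_type_not_in_allowed_set;

    template <bool Allowed, typename T> struct construct_if;

    template <typename T>
    struct construct_if<true, T> {   // allowed type: placement-construct it for real
        static void apply(void* space, const T& x) { ::new (space) T(x); }
    };

    template <typename T>
    struct construct_if<false, T> {  // disallowed type: instantiating apply() names the
        static void apply(void*, const T& x) {  // incomplete type, so compilation fails
            error_type_not_in_allowed_set<T>::report(x);
        }
    };

Because the false branch is only instantiated when someone actually tries to store a type that is not in the tuple, well-formed uses never even see the undefined template.
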
+template struct ERROR_Type_Not_allowed_In_Tagged_Msg_Not_Member_Of_Tuple; + +template struct do_if; +template +struct do_if { + static void construct(void *mySpace, const T& x) { + (void) new(mySpace) Wrapper(x); + } +}; +template +struct do_if { + static void construct(void * /*mySpace*/, const T& x) { + // This method is instantiated when the type T does not match any of the + // element types in the Tuple in variant. + ERROR_Type_Not_allowed_In_Tagged_Msg_Not_Member_Of_Tuple::bad_type(x); + } +}; + +// Tuple tells us the allowed types that variant can hold. It determines the alignment of the space in +// Wrapper, and how big Wrapper is. +// +// the object can only be tested for type, and a read-only reference can be fetched by cast_to(). + +using tbb::detail::punned_cast; +struct tagged_null_type {}; +template +class tagged_msg { + typedef std::tuple= 6 + , T5 + #endif + #if __TBB_VARIADIC_MAX >= 7 + , T6 + #endif + #if __TBB_VARIADIC_MAX >= 8 + , T7 + #endif + #if __TBB_VARIADIC_MAX >= 9 + , T8 + #endif + #if __TBB_VARIADIC_MAX >= 10 + , T9 + #endif + > Tuple; + +private: + class variant { + static const size_t N = std::tuple_size::value; + typedef typename pick_tuple_max::type AlignType; + typedef typename pick_tuple_max::type MaxSizeType; + static const size_t MaxNBytes = (sizeof(Wrapper)+sizeof(AlignType)-1); + static const size_t MaxNElements = MaxNBytes/sizeof(AlignType); + typedef aligned_space SpaceType; + SpaceType my_space; + static const size_t MaxSize = sizeof(SpaceType); + + public: + variant() { (void) new(&my_space) Wrapper(default_constructed()); } + + template + variant( const T& x ) { + do_if::value>::construct(&my_space,x); + } + + variant(const variant& other) { + const WrapperBase * h = punned_cast(&(other.my_space)); + h->CopyTo(&my_space); + } + + // assignment must destroy and re-create the Wrapper type, as there is no way + // to create a Wrapper-to-Wrapper assign even if we find they agree in type. 
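
The operator= that follows does exactly that: run the stored Wrapper's destructor, then let the source rebuild itself in the freed buffer through the virtual CopyTo(). A minimal sketch of the same type-erased copy machinery (illustrative names, a fixed 64-byte buffer instead of the aligned_space sizing above, and the same punned-pointer assumption that the erased object starts at the buffer address):

    #include <cstddef>
    #include <new>

    struct erased_base {
        virtual ~erased_base() = default;
        virtual void copy_to(void* space) const = 0;   // placement-copy *this into space
    };

    template <typename T>
    struct erased : erased_base {
        T value;
        explicit erased(const T& v) : value(v) {}
        void copy_to(void* space) const override { ::new (space) erased<T>(value); }
    };

    // Single-slot, copyable variant in the spirit of the class above: copy
    // construction and assignment both go through the virtual copy_to().
    class tiny_variant {
        alignas(std::max_align_t) unsigned char space_[64];
        // Like punned_cast: relies on the erased<T> object living at the buffer address.
        erased_base* self() { return reinterpret_cast<erased_base*>(space_); }
        const erased_base* self() const { return reinterpret_cast<const erased_base*>(space_); }
    public:
        template <typename T>
        explicit tiny_variant(const T& v) {
            static_assert(sizeof(erased<T>) <= sizeof(space_), "type too large for the buffer");
            ::new (space_) erased<T>(v);
        }
        tiny_variant(const tiny_variant& other) { other.self()->copy_to(space_); }
        tiny_variant& operator=(const tiny_variant& rhs) {
            if (this != &rhs) {
                self()->~erased_base();        // destroy whatever is currently held...
                rhs.self()->copy_to(space_);   // ...then rebuild a copy of rhs in place
            }
            return *this;
        }
        ~tiny_variant() { self()->~erased_base(); }
    };
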
+ void operator=( const variant& rhs ) { + if(&rhs != this) { + WrapperBase *h = punned_cast(&my_space); + h->~WrapperBase(); + const WrapperBase *ch = punned_cast(&(rhs.my_space)); + ch->CopyTo(&my_space); + } + } + + template + const U& variant_cast_to() const { + const Wrapper *h = dynamic_cast*>(punned_cast(&my_space)); + if(!h) { + throw_exception(exception_id::bad_tagged_msg_cast); + } + return h->value(); + } + template + bool variant_is_a() const { return dynamic_cast*>(punned_cast(&my_space)) != nullptr; } + + bool variant_is_default_constructed() const {return variant_is_a();} + + ~variant() { + WrapperBase *h = punned_cast(&my_space); + h->~WrapperBase(); + } + }; //class variant + + TagType my_tag; + variant my_msg; + +public: + tagged_msg(): my_tag(TagType(~0)), my_msg(){} + + template + tagged_msg(T const &index, R const &value) : my_tag(index), my_msg(value) {} + + template + tagged_msg(T const &index, R (&value)[N]) : my_tag(index), my_msg(value) {} + + void set_tag(TagType const &index) {my_tag = index;} + TagType tag() const {return my_tag;} + + template + const V& cast_to() const {return my_msg.template variant_cast_to();} + + template + bool is_a() const {return my_msg.template variant_is_a();} + + bool is_default_constructed() const {return my_msg.variant_is_default_constructed();} +}; //class tagged_msg + +// template to simplify cast and test for tagged_msg in template contexts +template +const V& cast_to(T const &t) { return t.template cast_to(); } + +template +bool is_a(T const &t) { return t.template is_a(); } + +enum op_stat { WAIT = 0, SUCCEEDED, FAILED }; + +#endif /* __TBB__flow_graph_types_impl_H */ diff --git a/third_party/tbb/detail/_hash_compare.h b/third_party/tbb/detail/_hash_compare.h new file mode 100644 index 000000000..2ad1551d0 --- /dev/null +++ b/third_party/tbb/detail/_hash_compare.h @@ -0,0 +1,148 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_detail__hash_compare_H +#define __TBB_detail__hash_compare_H + +#include "third_party/libcxx/functional" + +#include "third_party/tbb/detail/_containers_helpers.h" + +namespace tbb { +namespace detail { +namespace d1 { + +template +class hash_compare { + using is_transparent_hash = has_transparent_key_equal; +public: + using hasher = Hash; + using key_equal = typename is_transparent_hash::type; + + hash_compare() = default; + hash_compare( hasher hash, key_equal equal ) : my_hasher(hash), my_equal(equal) {} + + std::size_t operator()( const Key& key ) const { + return std::size_t(my_hasher(key)); + } + + bool operator()( const Key& key1, const Key& key2 ) const { + return my_equal(key1, key2); + } + + template ::type> + std::size_t operator()( const K& key ) const { + return std::size_t(my_hasher(key)); + } + + template ::type> + bool operator()( const K1& key1, const K2& key2 ) const { + return my_equal(key1, key2); + } + + hasher hash_function() const { + return my_hasher; + } + + key_equal key_eq() const { + return my_equal; + } + + +private: + hasher my_hasher; + key_equal my_equal; +}; // class hash_compare + +//! hash_compare that is default argument for concurrent_hash_map +template +class tbb_hash_compare { +public: + std::size_t hash( const Key& a ) const { return my_hash_func(a); } +#if defined(_MSC_VER) && _MSC_VER <= 1900 +#pragma warning (push) +// MSVC 2015 throws a strange warning: 'std::size_t': forcing value to bool 'true' or 'false' +#pragma warning (disable: 4800) +#endif + bool equal( const Key& a, const Key& b ) const { return my_key_equal(a, b); } +#if defined(_MSC_VER) && _MSC_VER <= 1900 +#pragma warning (pop) +#endif +private: + std::hash my_hash_func; + std::equal_to my_key_equal; +}; + +} // namespace d1 +#if __TBB_CPP20_CONCEPTS_PRESENT +inline namespace d0 { + +template +concept hash_compare = std::copy_constructible && + requires( const std::remove_reference_t& hc, const Key& key1, const Key& key2 ) { + { hc.hash(key1) } -> std::same_as; + { hc.equal(key1, key2) } -> std::convertible_to; + }; + +} // namespace d0 +#endif // __TBB_CPP20_CONCEPTS_PRESENT +} // namespace detail +} // namespace tbb + +#if TBB_DEFINE_STD_HASH_SPECIALIZATIONS + +namespace std { + +template +struct hash> { +public: + std::size_t operator()( const std::pair& p ) const { + return first_hash(p.first) ^ second_hash(p.second); + } + +private: + std::hash first_hash; + std::hash second_hash; +}; // struct hash + +// Apple clang and MSVC defines their own specializations for std::hash> +#if !(_LIBCPP_VERSION) && !(_CPPLIB_VER) + +template +struct hash> { +public: + std::size_t operator()( const std::basic_string& s ) const { + std::size_t h = 0; + for ( const CharT* c = s.c_str(); *c; ++c ) { + h = h * hash_multiplier ^ char_hash(*c); + } + return h; + } + +private: + static constexpr std::size_t hash_multiplier = tbb::detail::select_size_t_constant<2654435769U, 11400714819323198485ULL>::value; + + std::hash char_hash; +}; // struct hash + +#endif // !(_LIBCPP_VERSION || _CPPLIB_VER) + +} // namespace std + +#endif // TBB_DEFINE_STD_HASH_SPECIALIZATIONS + +#endif // __TBB_detail__hash_compare_H diff --git a/third_party/tbb/detail/_intrusive_list_node.h b/third_party/tbb/detail/_intrusive_list_node.h new file mode 100644 index 000000000..d3e1e506b --- /dev/null +++ b/third_party/tbb/detail/_intrusive_list_node.h @@ -0,0 +1,42 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you 
may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_detail__intrusive_list_node_H +#define _TBB_detail__intrusive_list_node_H + +namespace tbb { +namespace detail { +namespace d1 { + +//! Data structure to be inherited by the types that can form intrusive lists. +/** Intrusive list is formed by means of the member_intrusive_list template class. + Note that type T must derive from intrusive_list_node either publicly or + declare instantiation member_intrusive_list as a friend. + This class implements a limited subset of std::list interface. **/ +struct intrusive_list_node { + intrusive_list_node* my_prev_node{}; + intrusive_list_node* my_next_node{}; +#if TBB_USE_ASSERT + intrusive_list_node() { my_prev_node = my_next_node = this; } +#endif /* TBB_USE_ASSERT */ +}; + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_detail__intrusive_list_node_H diff --git a/third_party/tbb/detail/_machine.h b/third_party/tbb/detail/_machine.h new file mode 100644 index 000000000..5e9df02ba --- /dev/null +++ b/third_party/tbb/detail/_machine.h @@ -0,0 +1,397 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_detail__machine_H +#define __TBB_detail__machine_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_assert.h" + +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/climits" +#include "third_party/libcxx/cstdint" +#include "third_party/libcxx/cstddef" + +#ifdef _WIN32 +// MISSING #include +#ifdef __TBBMALLOC_BUILD +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" // SwitchToThread() +#endif +#ifdef _MSC_VER +#if __TBB_x86_64 || __TBB_x86_32 +#pragma intrinsic(__rdtsc) +#endif +#endif +#endif +#if __TBB_x86_64 || __TBB_x86_32 +#include "third_party/intel/immintrin.internal.h" // _mm_pause +#endif +#if (_WIN32) +#include "libc/math.h" +#include "libc/runtime/fenv.h" // _control87 +#endif + +#if __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN +#include "libc/calls/calls.h" +#include "libc/calls/struct/cpuset.h" +#include "libc/calls/struct/sched_param.h" +#include "libc/calls/weirdtypes.h" +#include "libc/sysv/consts/sched.h" // sched_yield +#else +#include "third_party/libcxx/thread" // std::this_thread::yield() +#endif + +namespace tbb { +namespace detail { +inline namespace d0 { + +//-------------------------------------------------------------------------------------------------- +// Yield implementation +//-------------------------------------------------------------------------------------------------- + +#if __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN +static inline void yield() { + int err = sched_yield(); + __TBB_ASSERT_EX(err == 0, "sched_yield has failed"); +} +#elif __TBBMALLOC_BUILD && _WIN32 +// Use Windows API for yield in tbbmalloc to avoid dependency on C++ runtime with some implementations. 
+static inline void yield() { + SwitchToThread(); +} +#else +using std::this_thread::yield; +#endif + +//-------------------------------------------------------------------------------------------------- +// atomic_fence_seq_cst implementation +//-------------------------------------------------------------------------------------------------- + +static inline void atomic_fence_seq_cst() { +#if (__TBB_x86_64 || __TBB_x86_32) && defined(__GNUC__) && __GNUC__ < 11 + unsigned char dummy = 0u; + __asm__ __volatile__ ("lock; notb %0" : "+m" (dummy) :: "memory"); +#else + std::atomic_thread_fence(std::memory_order_seq_cst); +#endif +} + +//-------------------------------------------------------------------------------------------------- +// Pause implementation +//-------------------------------------------------------------------------------------------------- + +static inline void machine_pause(int32_t delay) { +#if __TBB_x86_64 || __TBB_x86_32 + while (delay-- > 0) { _mm_pause(); } +#elif __ARM_ARCH_7A__ || __aarch64__ + while (delay-- > 0) { __asm__ __volatile__("yield" ::: "memory"); } +#else /* Generic */ + (void)delay; // suppress without including _template_helpers.h + yield(); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// tbb::detail::log2() implementation +//////////////////////////////////////////////////////////////////////////////////////////////////// +// TODO: Use log2p1() function that will be available in C++20 standard + +#if defined(__GNUC__) || defined(__clang__) +namespace gnu_builtins { + inline uintptr_t clz(unsigned int x) { return static_cast(__builtin_clz(x)); } + inline uintptr_t clz(unsigned long int x) { return static_cast(__builtin_clzl(x)); } + inline uintptr_t clz(unsigned long long int x) { return static_cast(__builtin_clzll(x)); } +} +#elif defined(_MSC_VER) +#pragma intrinsic(__TBB_W(_BitScanReverse)) +namespace msvc_intrinsics { + static inline uintptr_t bit_scan_reverse(uintptr_t i) { + unsigned long j; + __TBB_W(_BitScanReverse)( &j, i ); + return j; + } +} +#endif + +template +constexpr std::uintptr_t number_of_bits() { + return sizeof(T) * CHAR_BIT; +} + +// logarithm is the index of the most significant non-zero bit +static inline uintptr_t machine_log2(uintptr_t x) { +#if defined(__GNUC__) || defined(__clang__) + // If P is a power of 2 and x() - 1) ^ gnu_builtins::clz(x); +#elif defined(_MSC_VER) + return msvc_intrinsics::bit_scan_reverse(x); +#elif __i386__ || __i386 /*for Sun OS*/ || __MINGW32__ + uintptr_t j, i = x; + __asm__("bsr %1,%0" : "=r"(j) : "r"(i)); + return j; +#elif __powerpc__ || __POWERPC__ + #if __TBB_WORDSIZE==8 + __asm__ __volatile__ ("cntlzd %0,%0" : "+r"(x)); + return 63 - static_cast(x); + #else + __asm__ __volatile__ ("cntlzw %0,%0" : "+r"(x)); + return 31 - static_cast(x); + #endif /*__TBB_WORDSIZE*/ +#elif __sparc + uint64_t count; + // one hot encode + x |= (x >> 1); + x |= (x >> 2); + x |= (x >> 4); + x |= (x >> 8); + x |= (x >> 16); + x |= (x >> 32); + // count 1's + __asm__ ("popc %1, %0" : "=r"(count) : "r"(x) ); + return count - 1; +#else + intptr_t result = 0; + + if( sizeof(x) > 4 && (uintptr_t tmp = x >> 32) ) { x = tmp; result += 32; } + if( uintptr_t tmp = x >> 16 ) { x = tmp; result += 16; } + if( uintptr_t tmp = x >> 8 ) { x = tmp; result += 8; } + if( uintptr_t tmp = x >> 4 ) { x = tmp; result += 4; } + if( uintptr_t tmp = x >> 2 ) { x = tmp; result += 2; } + + return (x & 2) ? 
result + 1 : result; +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// tbb::detail::reverse_bits() implementation +//////////////////////////////////////////////////////////////////////////////////////////////////// +#if TBB_USE_CLANG_BITREVERSE_BUILTINS +namespace llvm_builtins { + inline uint8_t builtin_bitreverse(uint8_t x) { return __builtin_bitreverse8 (x); } + inline uint16_t builtin_bitreverse(uint16_t x) { return __builtin_bitreverse16(x); } + inline uint32_t builtin_bitreverse(uint32_t x) { return __builtin_bitreverse32(x); } + inline uint64_t builtin_bitreverse(uint64_t x) { return __builtin_bitreverse64(x); } +} +#else // generic +template +struct reverse { + static const T byte_table[256]; +}; + +template +const T reverse::byte_table[256] = { + 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0, + 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, + 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, + 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, + 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, + 0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, + 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, + 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, + 0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1, + 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, + 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, + 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, + 0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, + 0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB, + 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, + 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF +}; + +inline unsigned char reverse_byte(unsigned char src) { + return reverse::byte_table[src]; +} +#endif // TBB_USE_CLANG_BITREVERSE_BUILTINS + +template +T machine_reverse_bits(T src) { +#if TBB_USE_CLANG_BITREVERSE_BUILTINS + return builtin_bitreverse(fixed_width_cast(src)); +#else /* Generic */ + T dst; + unsigned char *original = reinterpret_cast(&src); + unsigned char *reversed = reinterpret_cast(&dst); + + for ( int i = sizeof(T) - 1; i >= 0; i-- ) { + reversed[i] = reverse_byte( original[sizeof(T) - i - 1] ); + } + + return dst; +#endif // TBB_USE_CLANG_BITREVERSE_BUILTINS +} + +} // inline namespace d0 + +namespace d1 { + +#if (_WIN32) +// API to retrieve/update FPU control setting +#define __TBB_CPU_CTL_ENV_PRESENT 1 +struct cpu_ctl_env { + unsigned int x87cw{}; +#if (__TBB_x86_64) + // Changing the infinity mode or the floating-point precision is not supported on x64. + // The attempt causes an assertion. 
See + // https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/control87-controlfp-control87-2 + static constexpr unsigned int X87CW_CONTROL_MASK = _MCW_DN | _MCW_EM | _MCW_RC; +#else + static constexpr unsigned int X87CW_CONTROL_MASK = ~0U; +#endif +#if (__TBB_x86_32 || __TBB_x86_64) + unsigned int mxcsr{}; + static constexpr unsigned int MXCSR_CONTROL_MASK = ~0x3fu; /* all except last six status bits */ +#endif + + bool operator!=( const cpu_ctl_env& ctl ) const { + return +#if (__TBB_x86_32 || __TBB_x86_64) + mxcsr != ctl.mxcsr || +#endif + x87cw != ctl.x87cw; + } + void get_env() { + x87cw = _control87(0, 0); +#if (__TBB_x86_32 || __TBB_x86_64) + mxcsr = _mm_getcsr(); +#endif + } + void set_env() const { + _control87(x87cw, X87CW_CONTROL_MASK); +#if (__TBB_x86_32 || __TBB_x86_64) + _mm_setcsr(mxcsr & MXCSR_CONTROL_MASK); +#endif + } +}; +#elif (__TBB_x86_32 || __TBB_x86_64) +// API to retrieve/update FPU control setting +#define __TBB_CPU_CTL_ENV_PRESENT 1 +struct cpu_ctl_env { + int mxcsr{}; + short x87cw{}; + static const int MXCSR_CONTROL_MASK = ~0x3f; /* all except last six status bits */ + + bool operator!=(const cpu_ctl_env& ctl) const { + return mxcsr != ctl.mxcsr || x87cw != ctl.x87cw; + } + void get_env() { + __asm__ __volatile__( + "stmxcsr %0\n\t" + "fstcw %1" + : "=m"(mxcsr), "=m"(x87cw) + ); + mxcsr &= MXCSR_CONTROL_MASK; + } + void set_env() const { + __asm__ __volatile__( + "ldmxcsr %0\n\t" + "fldcw %1" + : : "m"(mxcsr), "m"(x87cw) + ); + } +}; +#endif + +} // namespace d1 + +} // namespace detail +} // namespace tbb + +#if !__TBB_CPU_CTL_ENV_PRESENT +#include "libc/runtime/fenv.h" + +#include "third_party/libcxx/cstring" + +namespace tbb { +namespace detail { + +namespace r1 { +void* __TBB_EXPORTED_FUNC cache_aligned_allocate(std::size_t size); +void __TBB_EXPORTED_FUNC cache_aligned_deallocate(void* p); +} // namespace r1 + +namespace d1 { + +class cpu_ctl_env { + fenv_t *my_fenv_ptr; +public: + cpu_ctl_env() : my_fenv_ptr(nullptr) {} + ~cpu_ctl_env() { + if ( my_fenv_ptr ) + r1::cache_aligned_deallocate( (void*)my_fenv_ptr ); + } + // It is possible not to copy memory but just to copy pointers but the following issues should be addressed: + // 1. The arena lifetime and the context lifetime are independent; + // 2. The user is allowed to recapture different FPU settings to context so 'current FPU settings' inside + // dispatch loop may become invalid. + // But do we really want to improve the fenv implementation? It seems to be better to replace the fenv implementation + // with a platform specific implementation. + cpu_ctl_env( const cpu_ctl_env &src ) : my_fenv_ptr(nullptr) { + *this = src; + } + cpu_ctl_env& operator=( const cpu_ctl_env &src ) { + __TBB_ASSERT( src.my_fenv_ptr, nullptr); + if ( !my_fenv_ptr ) + my_fenv_ptr = (fenv_t*)r1::cache_aligned_allocate(sizeof(fenv_t)); + *my_fenv_ptr = *src.my_fenv_ptr; + return *this; + } + bool operator!=( const cpu_ctl_env &ctl ) const { + __TBB_ASSERT( my_fenv_ptr, "cpu_ctl_env is not initialized." ); + __TBB_ASSERT( ctl.my_fenv_ptr, "cpu_ctl_env is not initialized." ); + return std::memcmp( (void*)my_fenv_ptr, (void*)ctl.my_fenv_ptr, sizeof(fenv_t) ); + } + void get_env () { + if ( !my_fenv_ptr ) + my_fenv_ptr = (fenv_t*)r1::cache_aligned_allocate(sizeof(fenv_t)); + fegetenv( my_fenv_ptr ); + } + const cpu_ctl_env& set_env () const { + __TBB_ASSERT( my_fenv_ptr, "cpu_ctl_env is not initialized." 
); + fesetenv( my_fenv_ptr ); + return *this; + } +}; + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif /* !__TBB_CPU_CTL_ENV_PRESENT */ + +#endif // __TBB_detail__machine_H diff --git a/third_party/tbb/detail/_mutex_common.h b/third_party/tbb/detail/_mutex_common.h new file mode 100644 index 000000000..3392df0f5 --- /dev/null +++ b/third_party/tbb/detail/_mutex_common.h @@ -0,0 +1,62 @@ +// clang-format off +/* + Copyright (c) 2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__mutex_common_H +#define __TBB_detail__mutex_common_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_utils.h" + +#if __TBB_CPP20_CONCEPTS_PRESENT +// MISSING #include + +namespace tbb { +namespace detail { +inline namespace d0 { + +template +concept mutex_scoped_lock = std::default_initializable && + std::constructible_from && + requires( Lock& lock, Mutex& mutex ) { + lock.acquire(mutex); + { lock.try_acquire(mutex) } -> adaptive_same_as; + lock.release(); + }; + +template +concept rw_mutex_scoped_lock = mutex_scoped_lock && + std::constructible_from && + requires( Lock& lock, Mutex& mutex ) { + lock.acquire(mutex, false); + { lock.try_acquire(mutex, false) } -> adaptive_same_as; + { lock.upgrade_to_writer() } -> adaptive_same_as; + { lock.downgrade_to_reader() } -> adaptive_same_as; + }; + +template +concept scoped_lockable = mutex_scoped_lock; + +template +concept rw_scoped_lockable = scoped_lockable && + rw_mutex_scoped_lock; + +} // namespace d0 +} // namespace detail +} // namespace tbb + +#endif // __TBB_CPP20_CONCEPTS_PRESENT +#endif // __TBB_detail__mutex_common_H diff --git a/third_party/tbb/detail/_namespace_injection.h b/third_party/tbb/detail/_namespace_injection.h new file mode 100644 index 000000000..6b61e4f0d --- /dev/null +++ b/third_party/tbb/detail/_namespace_injection.h @@ -0,0 +1,25 @@ +// clang-format off +/* + Copyright (c) 2020-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// All public entities of the OneAPI Spec are available under oneapi namespace + +// Define tbb namespace first as it might not be known yet +namespace tbb {} + +namespace oneapi { +namespace tbb = ::tbb; +} diff --git a/third_party/tbb/detail/_node_handle.h b/third_party/tbb/detail/_node_handle.h new file mode 100644 index 000000000..29bec49af --- /dev/null +++ b/third_party/tbb/detail/_node_handle.h @@ -0,0 +1,163 @@ +// clang-format off +/* + Copyright (c) 2019-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__node_handle_H +#define __TBB_detail__node_handle_H + +#include "third_party/tbb/detail/_allocator_traits.h" +#include "third_party/tbb/detail/_assert.h" + +namespace tbb { +namespace detail { +namespace d1 { + +// A structure to access private node handle methods in internal TBB classes +// Regular friend declaration is not convenient because classes which use node handle +// can be placed in the different versioning namespaces. +struct node_handle_accessor { + template + static typename NodeHandleType::node* get_node_ptr( NodeHandleType& nh ) { + return nh.get_node_ptr(); + } + + template + static NodeHandleType construct( typename NodeHandleType::node* node_ptr ) { + return NodeHandleType{node_ptr}; + } + + template + static void deactivate( NodeHandleType& nh ) { + nh.deactivate(); + } +}; // struct node_handle_accessor + +template +class node_handle_base { +public: + using allocator_type = Allocator; +protected: + using node = Node; + using allocator_traits_type = tbb::detail::allocator_traits; +public: + + node_handle_base() : my_node(nullptr), my_allocator() {} + node_handle_base(node_handle_base&& nh) : my_node(nh.my_node), + my_allocator(std::move(nh.my_allocator)) { + nh.my_node = nullptr; + } + + __TBB_nodiscard bool empty() const { return my_node == nullptr; } + explicit operator bool() const { return my_node != nullptr; } + + ~node_handle_base() { internal_destroy(); } + + node_handle_base& operator=( node_handle_base&& nh ) { + internal_destroy(); + my_node = nh.my_node; + move_assign_allocators(my_allocator, nh.my_allocator); + nh.deactivate(); + return *this; + } + + void swap( node_handle_base& nh ) { + using std::swap; + swap(my_node, nh.my_node); + swap_allocators(my_allocator, nh.my_allocator); + } + + allocator_type get_allocator() const { + return my_allocator; + } + +protected: + node_handle_base( node* n ) : my_node(n) {} + + void internal_destroy() { + if(my_node != nullptr) { + allocator_traits_type::destroy(my_allocator, my_node->storage()); + typename allocator_traits_type::template rebind_alloc node_allocator(my_allocator); + node_allocator.deallocate(my_node, 1); + } + } + + node* get_node_ptr() { return my_node; } + + void deactivate() { my_node = nullptr; } + + node* my_node; + allocator_type my_allocator; +}; + +// node handle for maps +template +class node_handle : public node_handle_base { + using base_type = node_handle_base; +public: + using key_type = Key; + using mapped_type = typename 
Value::second_type; + using allocator_type = typename base_type::allocator_type; + + node_handle() = default; + + key_type& key() const { + __TBB_ASSERT(!this->empty(), "Cannot get key from the empty node_type object"); + return *const_cast(&(this->my_node->value().first)); + } + + mapped_type& mapped() const { + __TBB_ASSERT(!this->empty(), "Cannot get mapped value from the empty node_type object"); + return this->my_node->value().second; + } + +private: + friend struct node_handle_accessor; + + node_handle( typename base_type::node* n ) : base_type(n) {} +}; // class node_handle + +// node handle for sets +template +class node_handle : public node_handle_base { + using base_type = node_handle_base; +public: + using value_type = Key; + using allocator_type = typename base_type::allocator_type; + + node_handle() = default; + + value_type& value() const { + __TBB_ASSERT(!this->empty(), "Cannot get value from the empty node_type object"); + return *const_cast(&(this->my_node->value())); + } + +private: + friend struct node_handle_accessor; + + node_handle( typename base_type::node* n ) : base_type(n) {} +}; // class node_handle + +template +void swap( node_handle& lhs, + node_handle& rhs ) { + return lhs.swap(rhs); +} + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail__node_handle_H diff --git a/third_party/tbb/detail/_pipeline_filters.h b/third_party/tbb/detail/_pipeline_filters.h new file mode 100644 index 000000000..48f453dbb --- /dev/null +++ b/third_party/tbb/detail/_pipeline_filters.h @@ -0,0 +1,456 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_parallel_filters_H +#define __TBB_parallel_filters_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/detail/_pipeline_filters_deduction.h" +#include "third_party/tbb/tbb_allocator.h" + +#include "third_party/libcxx/cstddef" +#include "third_party/libcxx/cstdint" + +namespace tbb { +namespace detail { + +namespace d1 { +class base_filter; +} + +namespace r1 { +TBB_EXPORT void __TBB_EXPORTED_FUNC set_end_of_input(d1::base_filter&); +class pipeline; +class stage_task; +class input_buffer; +} + +namespace d1 { +class filter_node; + +//! A stage in a pipeline. +/** @ingroup algorithms */ +class base_filter{ +private: + //! Value used to mark "not in pipeline" + static base_filter* not_in_pipeline() { return reinterpret_cast(std::intptr_t(-1)); } +public: + //! The lowest bit 0 is for parallel vs serial + static constexpr unsigned int filter_is_serial = 0x1; + + //! 2nd bit distinguishes ordered vs unordered filters. + static constexpr unsigned int filter_is_out_of_order = 0x1<<1; + + //! 
3rd bit marks input filters emitting small objects + static constexpr unsigned int filter_may_emit_null = 0x1<<2; + + base_filter(const base_filter&) = delete; + base_filter& operator=(const base_filter&) = delete; + +protected: + explicit base_filter( unsigned int m ) : + next_filter_in_pipeline(not_in_pipeline()), + my_input_buffer(nullptr), + my_filter_mode(m), + my_pipeline(nullptr) + {} + + // signal end-of-input for concrete_filters + void set_end_of_input() { + r1::set_end_of_input(*this); + } + +public: + //! True if filter is serial. + bool is_serial() const { + return bool( my_filter_mode & filter_is_serial ); + } + + //! True if filter must receive stream in order. + bool is_ordered() const { + return (my_filter_mode & filter_is_serial) && !(my_filter_mode & filter_is_out_of_order); + } + + //! true if an input filter can emit null + bool object_may_be_null() { + return ( my_filter_mode & filter_may_emit_null ) == filter_may_emit_null; + } + + //! Operate on an item from the input stream, and return item for output stream. + /** Returns nullptr if filter is a sink. */ + virtual void* operator()( void* item ) = 0; + + //! Destroy filter. + virtual ~base_filter() {}; + + //! Destroys item if pipeline was cancelled. + /** Required to prevent memory leaks. + Note it can be called concurrently even for serial filters.*/ + virtual void finalize( void* /*item*/ ) {} + +private: + //! Pointer to next filter in the pipeline. + base_filter* next_filter_in_pipeline; + + //! Buffer for incoming tokens, or nullptr if not required. + /** The buffer is required if the filter is serial. */ + r1::input_buffer* my_input_buffer; + + friend class r1::stage_task; + friend class r1::pipeline; + friend void r1::set_end_of_input(d1::base_filter&); + + //! Storage for filter mode and dynamically checked implementation version. + const unsigned int my_filter_mode; + + //! Pointer to the pipeline. + r1::pipeline* my_pipeline; +}; + +template +class concrete_filter; + +//! input_filter control to signal end-of-input for parallel_pipeline +class flow_control { + bool is_pipeline_stopped = false; + flow_control() = default; + template friend class concrete_filter; + template + __TBB_requires(std::copyable) + friend class input_node; +public: + void stop() { is_pipeline_stopped = true; } +}; + +// Emulate std::is_trivially_copyable (false positives not allowed, false negatives suboptimal but safe). 
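+// (Annotation added in this port, not in the upstream sources.) Together with
+// use_allocator<T> below, this trait decides how a token travels between filters:
+// a type that fits in a void* and is trivially copyable (e.g. int, any pointer) is
+// bit-copied through the pipeline's void* slot, while anything larger or non-trivial
+// (e.g. std::string) is heap-allocated via r1::allocate_memory and passed by pointer;
+// see the token_helper specializations below.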
+#if __TBB_CPP11_TYPE_PROPERTIES_PRESENT +template using tbb_trivially_copyable = std::is_trivially_copyable; +#else +template struct tbb_trivially_copyable { enum { value = false }; }; +template struct tbb_trivially_copyable < T* > { enum { value = true }; }; +template<> struct tbb_trivially_copyable < bool > { enum { value = true }; }; +template<> struct tbb_trivially_copyable < char > { enum { value = true }; }; +template<> struct tbb_trivially_copyable < signed char > { enum { value = true }; }; +template<> struct tbb_trivially_copyable { enum { value = true }; }; +template<> struct tbb_trivially_copyable < short > { enum { value = true }; }; +template<> struct tbb_trivially_copyable { enum { value = true }; }; +template<> struct tbb_trivially_copyable < int > { enum { value = true }; }; +template<> struct tbb_trivially_copyable { enum { value = true }; }; +template<> struct tbb_trivially_copyable < long > { enum { value = true }; }; +template<> struct tbb_trivially_copyable { enum { value = true }; }; +template<> struct tbb_trivially_copyable < long long> { enum { value = true }; }; +template<> struct tbb_trivially_copyable { enum { value = true }; }; +template<> struct tbb_trivially_copyable < float > { enum { value = true }; }; +template<> struct tbb_trivially_copyable < double > { enum { value = true }; }; +template<> struct tbb_trivially_copyable < long double > { enum { value = true }; }; +#endif // __TBB_CPP11_TYPE_PROPERTIES_PRESENT + +template +struct use_allocator { + static constexpr bool value = sizeof(T) > sizeof(void *) || !tbb_trivially_copyable::value; +}; + +// A helper class to customize how a type is passed between filters. +// Usage: token_helper::value> +template struct token_helper; + +// using tbb_allocator +template +struct token_helper { + using pointer = T*; + using value_type = T; + static pointer create_token(value_type && source) { + return new (r1::allocate_memory(sizeof(T))) T(std::move(source)); + } + static value_type & token(pointer & t) { return *t; } + static void * cast_to_void_ptr(pointer ref) { return reinterpret_cast(ref); } + static pointer cast_from_void_ptr(void * ref) { return reinterpret_cast(ref); } + static void destroy_token(pointer token) { + token->~value_type(); + r1::deallocate_memory(token); + } +}; + +// pointer specialization +template +struct token_helper { + using pointer = T*; + using value_type = T*; + static pointer create_token(const value_type & source) { return source; } + static value_type & token(pointer & t) { return t; } + static void * cast_to_void_ptr(pointer ref) { return reinterpret_cast(ref); } + static pointer cast_from_void_ptr(void * ref) { return reinterpret_cast(ref); } + static void destroy_token( pointer /*token*/) {} +}; + +// converting type to and from void*, passing objects directly +template +struct token_helper { + typedef union { + T actual_value; + void * void_overlay; + } type_to_void_ptr_map; + using pointer = T; // not really a pointer in this case. 
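+    // (Annotation added in this port.) The token is overlaid onto a void* through the
+    // union above, so small trivially-copyable values travel by value with no allocation.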
+ using value_type = T; + static pointer create_token(const value_type & source) { return source; } + static value_type & token(pointer & t) { return t; } + static void * cast_to_void_ptr(pointer ref) { + type_to_void_ptr_map mymap; + mymap.void_overlay = nullptr; + mymap.actual_value = ref; + return mymap.void_overlay; + } + static pointer cast_from_void_ptr(void * ref) { + type_to_void_ptr_map mymap; + mymap.void_overlay = ref; + return mymap.actual_value; + } + static void destroy_token( pointer /*token*/) {} +}; + +// intermediate +template +class concrete_filter: public base_filter { + const Body& my_body; + using input_helper = token_helper::value>; + using input_pointer = typename input_helper::pointer; + using output_helper = token_helper::value>; + using output_pointer = typename output_helper::pointer; + + void* operator()(void* input) override { + input_pointer temp_input = input_helper::cast_from_void_ptr(input); + output_pointer temp_output = output_helper::create_token(tbb::detail::invoke(my_body, std::move(input_helper::token(temp_input)))); + input_helper::destroy_token(temp_input); + return output_helper::cast_to_void_ptr(temp_output); + } + + void finalize(void * input) override { + input_pointer temp_input = input_helper::cast_from_void_ptr(input); + input_helper::destroy_token(temp_input); + } + +public: + concrete_filter(unsigned int m, const Body& body) : base_filter(m), my_body(body) {} +}; + +// input +template +class concrete_filter: public base_filter { + const Body& my_body; + using output_helper = token_helper::value>; + using output_pointer = typename output_helper::pointer; + + void* operator()(void*) override { + flow_control control; + output_pointer temp_output = output_helper::create_token(my_body(control)); + if(control.is_pipeline_stopped) { + output_helper::destroy_token(temp_output); + set_end_of_input(); + return nullptr; + } + return output_helper::cast_to_void_ptr(temp_output); + } + +public: + concrete_filter(unsigned int m, const Body& body) : + base_filter(m | filter_may_emit_null), + my_body(body) + {} +}; + +// output +template +class concrete_filter: public base_filter { + const Body& my_body; + using input_helper = token_helper::value>; + using input_pointer = typename input_helper::pointer; + + void* operator()(void* input) override { + input_pointer temp_input = input_helper::cast_from_void_ptr(input); + tbb::detail::invoke(my_body, std::move(input_helper::token(temp_input))); + input_helper::destroy_token(temp_input); + return nullptr; + } + void finalize(void* input) override { + input_pointer temp_input = input_helper::cast_from_void_ptr(input); + input_helper::destroy_token(temp_input); + } + +public: + concrete_filter(unsigned int m, const Body& body) : base_filter(m), my_body(body) {} +}; + +template +class concrete_filter: public base_filter { + const Body& my_body; + + void* operator()(void*) override { + flow_control control; + my_body(control); + void* output = control.is_pipeline_stopped ? 
nullptr : (void*)(std::intptr_t)-1; + return output; + } +public: + concrete_filter(unsigned int m, const Body& body) : base_filter(m), my_body(body) {} +}; + +class filter_node_ptr { + filter_node * my_node; + +public: + filter_node_ptr() : my_node(nullptr) {} + filter_node_ptr(filter_node *); + ~filter_node_ptr(); + filter_node_ptr(const filter_node_ptr &); + filter_node_ptr(filter_node_ptr &&); + void operator=(filter_node *); + void operator=(const filter_node_ptr &); + void operator=(filter_node_ptr &&); + filter_node& operator*() const; + operator bool() const; +}; + +//! Abstract base class that represents a node in a parse tree underlying a filter class. +/** These nodes are always heap-allocated and can be shared by filter objects. */ +class filter_node { + /** Count must be atomic because it is hidden state for user, but might be shared by threads. */ + std::atomic ref_count; +public: + filter_node_ptr left; + filter_node_ptr right; +protected: + filter_node() : ref_count(0), left(nullptr), right(nullptr) { +#ifdef __TBB_TEST_FILTER_NODE_COUNT + ++(__TBB_TEST_FILTER_NODE_COUNT); +#endif + } +public: + filter_node(const filter_node_ptr& x, const filter_node_ptr& y) : filter_node(){ + left = x; + right = y; + } + filter_node(const filter_node&) = delete; + filter_node& operator=(const filter_node&) = delete; + + //! Add concrete_filter to pipeline + virtual base_filter* create_filter() const { + __TBB_ASSERT(false, "method of non-leaf was called"); + return nullptr; + } + + //! Increment reference count + void add_ref() { ref_count.fetch_add(1, std::memory_order_relaxed); } + + //! Decrement reference count and delete if it becomes zero. + void remove_ref() { + __TBB_ASSERT(ref_count>0,"ref_count underflow"); + if( ref_count.fetch_sub(1, std::memory_order_relaxed) == 1 ) { + this->~filter_node(); + r1::deallocate_memory(this); + } + } + + virtual ~filter_node() { +#ifdef __TBB_TEST_FILTER_NODE_COUNT + --(__TBB_TEST_FILTER_NODE_COUNT); +#endif + } +}; + +inline filter_node_ptr::filter_node_ptr(filter_node * nd) : my_node(nd) { + if (my_node) { + my_node->add_ref(); + } +} + +inline filter_node_ptr::~filter_node_ptr() { + if (my_node) { + my_node->remove_ref(); + } +} + +inline filter_node_ptr::filter_node_ptr(const filter_node_ptr & rhs) : my_node(rhs.my_node) { + if (my_node) { + my_node->add_ref(); + } +} + +inline filter_node_ptr::filter_node_ptr(filter_node_ptr && rhs) : my_node(rhs.my_node) { + rhs.my_node = nullptr; +} + +inline void filter_node_ptr::operator=(filter_node * rhs) { + // Order of operations below carefully chosen so that reference counts remain correct + // in unlikely event that remove_ref throws exception. + filter_node* old = my_node; + my_node = rhs; + if (my_node) { + my_node->add_ref(); + } + if (old) { + old->remove_ref(); + } +} + +inline void filter_node_ptr::operator=(const filter_node_ptr & rhs) { + *this = rhs.my_node; +} + +inline void filter_node_ptr::operator=(filter_node_ptr && rhs) { + filter_node* old = my_node; + my_node = rhs.my_node; + rhs.my_node = nullptr; + if (old) { + old->remove_ref(); + } +} + +inline filter_node& filter_node_ptr::operator*() const{ + __TBB_ASSERT(my_node,"nullptr node is used"); + return *my_node; +} + +inline filter_node_ptr::operator bool() const { + return my_node != nullptr; +} + +//! Node in parse tree representing result of make_filter. 
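+//! (Annotation added in this port, not in the upstream sources.) Leaves are what
+//! make_filter produces; operator& on filters builds the inner filter_node objects.
+//! A minimal usage sketch, assuming user-provided produce()/consume() helpers:
+//!
+//!   tbb::parallel_pipeline(/*max_number_of_live_tokens=*/8,
+//!       tbb::make_filter<void, int>(tbb::filter_mode::serial_in_order,
+//!           [](tbb::flow_control& fc) -> int {
+//!               int v;
+//!               if (!produce(v)) { fc.stop(); return 0; }
+//!               return v;
+//!           })
+//!     & tbb::make_filter<int, void>(tbb::filter_mode::parallel,
+//!           [](int v) { consume(v); }));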
+template +class filter_node_leaf: public filter_node { + const unsigned int my_mode; + const Body my_body; + base_filter* create_filter() const override { + return new(r1::allocate_memory(sizeof(concrete_filter))) concrete_filter(my_mode,my_body); + } +public: + filter_node_leaf( unsigned int m, const Body& b ) : my_mode(m), my_body(b) {} +}; + + +template ::input_type> +using filter_input = typename std::conditional::value, void, Input>::type; + +template +using filter_output = typename filter_body_types::output_type; + +} // namespace d1 +} // namespace detail +} // namespace tbb + + +#endif /* __TBB_parallel_filters_H */ diff --git a/third_party/tbb/detail/_pipeline_filters_deduction.h b/third_party/tbb/detail/_pipeline_filters_deduction.h new file mode 100644 index 000000000..ad183f4cb --- /dev/null +++ b/third_party/tbb/detail/_pipeline_filters_deduction.h @@ -0,0 +1,47 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB__pipeline_filters_deduction_H +#define __TBB__pipeline_filters_deduction_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/libcxx/utility" +#include "third_party/libcxx/type_traits" + +namespace tbb { +namespace detail { +namespace d1 { + +template +struct declare_filter_types { + using input_type = typename std::remove_const::type>::type; + using output_type = typename std::remove_const::type>::type; +}; + +template struct filter_body_types; + +template +struct filter_body_types : declare_filter_types {}; + +template +struct filter_body_types : declare_filter_types {}; + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif // __TBB__pipeline_filters_deduction_H diff --git a/third_party/tbb/detail/_range_common.h b/third_party/tbb/detail/_range_common.h new file mode 100644 index 000000000..15f4d2bea --- /dev/null +++ b/third_party/tbb/detail/_range_common.h @@ -0,0 +1,131 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__range_common_H +#define __TBB_detail__range_common_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_utils.h" +#if __TBB_CPP20_CONCEPTS_PRESENT +// MISSING #include +#endif +#include "third_party/libcxx/iterator" + +namespace tbb { +namespace detail { +inline namespace d0 { + +//! Dummy type that distinguishes splitting constructor from copy constructor. 
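+//! (Annotation added in this port, not in the upstream sources.) A user-defined Range
+//! opts into splitting by providing a constructor of the form
+//!   Range( Range& other, split )
+//! which takes roughly half of `other` for the newly constructed object;
+//! blocked_range is the canonical model of this pattern.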
+/** + * See description of parallel_for and parallel_reduce for example usages. + * @ingroup algorithms + */ +class split {}; + +//! Type enables transmission of splitting proportion from partitioners to range objects +/** + * In order to make use of such facility Range objects must implement + * splitting constructor with this type passed. + */ +class proportional_split : no_assign { +public: + proportional_split(size_t _left = 1, size_t _right = 1) : my_left(_left), my_right(_right) { } + + size_t left() const { return my_left; } + size_t right() const { return my_right; } + + // used when range does not support proportional split + explicit operator split() const { return split(); } + +private: + size_t my_left, my_right; +}; + +template +struct range_split_object_provider { + template + static split get( PartitionerSplitType& ) { return split(); } +}; + +template +struct range_split_object_provider::value>::type> { + template + static PartitionerSplitType& get( PartitionerSplitType& split_obj ) { return split_obj; } +}; + +template +auto get_range_split_object( PartitionerSplitType& split_obj ) +-> decltype(range_split_object_provider::get(split_obj)) { + return range_split_object_provider::get(split_obj); +} + +template +using range_iterator_type = decltype(std::begin(std::declval())); + +#if __TBB_CPP20_CONCEPTS_PRESENT +template +using iterator_reference_type = typename std::iterator_traits::reference; + +template +using range_reference_type = iterator_reference_type>; + +template +concept blocked_range_value = std::copyable && + requires( const std::remove_reference_t& lhs, const std::remove_reference_t& rhs ) { + { lhs < rhs } -> relaxed_convertible_to; + { lhs - rhs } -> std::convertible_to; + { lhs + (rhs - lhs) } -> std::convertible_to; + }; + +template +concept splittable = std::constructible_from; + +template +concept tbb_range = std::copy_constructible && + splittable && + requires( const std::remove_reference_t& range ) { + { range.empty() } -> relaxed_convertible_to; + { range.is_divisible() } -> relaxed_convertible_to; + }; + +template +constexpr bool iterator_concept_helper( std::input_iterator_tag ) { + return std::input_iterator; +} + +template +constexpr bool iterator_concept_helper( std::random_access_iterator_tag ) { + return std::random_access_iterator; +} + +template +concept iterator_satisfies = requires (IteratorTag tag) { + requires iterator_concept_helper(tag); +}; + +template +concept container_based_sequence = requires( Sequence& seq ) { + { std::begin(seq) } -> iterator_satisfies; + { std::end(seq) } -> iterator_satisfies; +}; +#endif // __TBB_CPP20_CONCEPTS_PRESENT +} // namespace d0 +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail__range_common_H diff --git a/third_party/tbb/detail/_rtm_mutex.h b/third_party/tbb/detail/_rtm_mutex.h new file mode 100644 index 000000000..0633bb6f6 --- /dev/null +++ b/third_party/tbb/detail/_rtm_mutex.h @@ -0,0 +1,163 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB__rtm_mutex_impl_H +#define __TBB__rtm_mutex_impl_H + +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/spin_mutex.h" + +#include "third_party/tbb/profiling.h" + +namespace tbb { +namespace detail { +namespace r1 { +struct rtm_mutex_impl; +} +namespace d1 { + +#if _MSC_VER && !defined(__INTEL_COMPILER) + // Suppress warning: structure was padded due to alignment specifier + #pragma warning (push) + #pragma warning (disable: 4324) +#endif + +/** A rtm_mutex is an speculation-enabled spin mutex. + It should be used for locking short critical sections where the lock is + contended but the data it protects are not. If zero-initialized, the + mutex is considered unheld. + @ingroup synchronization */ +class alignas(max_nfs_size) rtm_mutex : private spin_mutex { +private: + enum class rtm_state { + rtm_none, + rtm_transacting, + rtm_real + }; +public: + //! Constructors + rtm_mutex() noexcept { + create_itt_sync(this, "tbb::speculative_spin_mutex", ""); + } + + //! Destructor + ~rtm_mutex() = default; + + //! Represents acquisition of a mutex. + class scoped_lock { + public: + friend class rtm_mutex; + //! Construct lock that has not acquired a mutex. + /** Equivalent to zero-initialization of *this. */ + constexpr scoped_lock() : m_mutex(nullptr), m_transaction_state(rtm_state::rtm_none) {} + + //! Acquire lock on given mutex. + scoped_lock(rtm_mutex& m) : m_mutex(nullptr), m_transaction_state(rtm_state::rtm_none) { + acquire(m); + } + + //! Release lock (if lock is held). + ~scoped_lock() { + if(m_transaction_state != rtm_state::rtm_none) { + release(); + } + } + + //! No Copy + scoped_lock(const scoped_lock&) = delete; + scoped_lock& operator=(const scoped_lock&) = delete; + + //! Acquire lock on given mutex. + void acquire(rtm_mutex& m); + + //! Try acquire lock on given mutex. + bool try_acquire(rtm_mutex& m); + + //! Release lock + void release(); + + private: + rtm_mutex* m_mutex; + rtm_state m_transaction_state; + friend r1::rtm_mutex_impl; + }; + + //! Mutex traits + static constexpr bool is_rw_mutex = false; + static constexpr bool is_recursive_mutex = false; + static constexpr bool is_fair_mutex = false; +private: + friend r1::rtm_mutex_impl; +}; // end of rtm_mutex +} // namespace d1 + +namespace r1 { + //! Internal acquire lock. + // only_speculate == true if we're doing a try_lock, else false. + TBB_EXPORT void __TBB_EXPORTED_FUNC acquire(d1::rtm_mutex&, d1::rtm_mutex::scoped_lock&, bool only_speculate = false); + //! Internal try_acquire lock. + TBB_EXPORT bool __TBB_EXPORTED_FUNC try_acquire(d1::rtm_mutex&, d1::rtm_mutex::scoped_lock&); + //! Internal release lock. + TBB_EXPORT void __TBB_EXPORTED_FUNC release(d1::rtm_mutex::scoped_lock&); +} // namespace r1 + +namespace d1 { +//! Acquire lock on given mutex. +inline void rtm_mutex::scoped_lock::acquire(rtm_mutex& m) { + __TBB_ASSERT(!m_mutex, "lock is already acquired"); + r1::acquire(m, *this); +} + +//! Try acquire lock on given mutex. +inline bool rtm_mutex::scoped_lock::try_acquire(rtm_mutex& m) { + __TBB_ASSERT(!m_mutex, "lock is already acquired"); + return r1::try_acquire(m, *this); +} + +//! 
Release lock +inline void rtm_mutex::scoped_lock::release() { + __TBB_ASSERT(m_mutex, "lock is not acquired"); + __TBB_ASSERT(m_transaction_state != rtm_state::rtm_none, "lock is not acquired"); + return r1::release(*this); +} + +#if _MSC_VER && !defined(__INTEL_COMPILER) + #pragma warning (pop) // 4324 warning +#endif + +#if TBB_USE_PROFILING_TOOLS +inline void set_name(rtm_mutex& obj, const char* name) { + itt_set_sync_name(&obj, name); +} +#if (_WIN32||_WIN64) +inline void set_name(rtm_mutex& obj, const wchar_t* name) { + itt_set_sync_name(&obj, name); +} +#endif // WIN +#else +inline void set_name(rtm_mutex&, const char*) {} +#if (_WIN32||_WIN64) +inline void set_name(rtm_mutex&, const wchar_t*) {} +#endif // WIN +#endif + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif /* __TBB__rtm_mutex_impl_H */ diff --git a/third_party/tbb/detail/_rtm_rw_mutex.h b/third_party/tbb/detail/_rtm_rw_mutex.h new file mode 100644 index 000000000..2f2d53e49 --- /dev/null +++ b/third_party/tbb/detail/_rtm_rw_mutex.h @@ -0,0 +1,216 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__rtm_rw_mutex_H +#define __TBB_detail__rtm_rw_mutex_H + +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/spin_rw_mutex.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { + +namespace r1 { +struct rtm_rw_mutex_impl; +} + +namespace d1 { + +constexpr std::size_t speculation_granularity = 64; +#if _MSC_VER && !defined(__INTEL_COMPILER) + // Suppress warning: structure was padded due to alignment specifier + #pragma warning (push) + #pragma warning (disable: 4324) +#endif + +//! Fast, unfair, spinning speculation-enabled reader-writer lock with backoff and writer-preference +/** @ingroup synchronization */ +class alignas(max_nfs_size) rtm_rw_mutex : private spin_rw_mutex { + friend struct r1::rtm_rw_mutex_impl; +private: + enum class rtm_type { + rtm_not_in_mutex, + rtm_transacting_reader, + rtm_transacting_writer, + rtm_real_reader, + rtm_real_writer + }; +public: + //! Constructors + rtm_rw_mutex() noexcept : write_flag(false) { + create_itt_sync(this, "tbb::speculative_spin_rw_mutex", ""); + } + + //! Destructor + ~rtm_rw_mutex() = default; + + //! Represents acquisition of a mutex. + class scoped_lock { + friend struct r1::rtm_rw_mutex_impl; + public: + //! Construct lock that has not acquired a mutex. + /** Equivalent to zero-initialization of *this. */ + constexpr scoped_lock() : m_mutex(nullptr), m_transaction_state(rtm_type::rtm_not_in_mutex) {} + + //! Acquire lock on given mutex. + scoped_lock(rtm_rw_mutex& m, bool write = true) : m_mutex(nullptr), m_transaction_state(rtm_type::rtm_not_in_mutex) { + acquire(m, write); + } + + //! Release lock (if lock is held). + ~scoped_lock() { + if(m_transaction_state != rtm_type::rtm_not_in_mutex) { + release(); + } + } + + //! 
No Copy + scoped_lock(const scoped_lock&) = delete; + scoped_lock& operator=(const scoped_lock&) = delete; + + //! Acquire lock on given mutex. + inline void acquire(rtm_rw_mutex& m, bool write = true); + + //! Try acquire lock on given mutex. + inline bool try_acquire(rtm_rw_mutex& m, bool write = true); + + //! Release lock + inline void release(); + + //! Upgrade reader to become a writer. + /** Returns whether the upgrade happened without releasing and re-acquiring the lock */ + inline bool upgrade_to_writer(); + + //! Downgrade writer to become a reader. + inline bool downgrade_to_reader(); + + inline bool is_writer() const; + private: + rtm_rw_mutex* m_mutex; + rtm_type m_transaction_state; + }; + + //! Mutex traits + static constexpr bool is_rw_mutex = true; + static constexpr bool is_recursive_mutex = false; + static constexpr bool is_fair_mutex = false; + +private: + alignas(speculation_granularity) std::atomic write_flag; +}; + +#if _MSC_VER && !defined(__INTEL_COMPILER) + #pragma warning (pop) // 4324 warning +#endif + +} // namespace d1 + +namespace r1 { + //! Internal acquire write lock. + // only_speculate == true if we're doing a try_lock, else false. + TBB_EXPORT void __TBB_EXPORTED_FUNC acquire_writer(d1::rtm_rw_mutex&, d1::rtm_rw_mutex::scoped_lock&, bool only_speculate = false); + //! Internal acquire read lock. + // only_speculate == true if we're doing a try_lock, else false. + TBB_EXPORT void __TBB_EXPORTED_FUNC acquire_reader(d1::rtm_rw_mutex&, d1::rtm_rw_mutex::scoped_lock&, bool only_speculate = false); + //! Internal upgrade reader to become a writer. + TBB_EXPORT bool __TBB_EXPORTED_FUNC upgrade(d1::rtm_rw_mutex::scoped_lock&); + //! Internal downgrade writer to become a reader. + TBB_EXPORT bool __TBB_EXPORTED_FUNC downgrade(d1::rtm_rw_mutex::scoped_lock&); + //! Internal try_acquire write lock. + TBB_EXPORT bool __TBB_EXPORTED_FUNC try_acquire_writer(d1::rtm_rw_mutex&, d1::rtm_rw_mutex::scoped_lock&); + //! Internal try_acquire read lock. + TBB_EXPORT bool __TBB_EXPORTED_FUNC try_acquire_reader(d1::rtm_rw_mutex&, d1::rtm_rw_mutex::scoped_lock&); + //! Internal release lock. + TBB_EXPORT void __TBB_EXPORTED_FUNC release(d1::rtm_rw_mutex::scoped_lock&); +} + +namespace d1 { +//! Acquire lock on given mutex. +void rtm_rw_mutex::scoped_lock::acquire(rtm_rw_mutex& m, bool write) { + __TBB_ASSERT(!m_mutex, "lock is already acquired"); + if (write) { + r1::acquire_writer(m, *this); + } else { + r1::acquire_reader(m, *this); + } +} + +//! Try acquire lock on given mutex. +bool rtm_rw_mutex::scoped_lock::try_acquire(rtm_rw_mutex& m, bool write) { + __TBB_ASSERT(!m_mutex, "lock is already acquired"); + if (write) { + return r1::try_acquire_writer(m, *this); + } else { + return r1::try_acquire_reader(m, *this); + } +} + +//! Release lock +void rtm_rw_mutex::scoped_lock::release() { + __TBB_ASSERT(m_mutex, "lock is not acquired"); + __TBB_ASSERT(m_transaction_state != rtm_type::rtm_not_in_mutex, "lock is not acquired"); + return r1::release(*this); +} + +//! Upgrade reader to become a writer. +/** Returns whether the upgrade happened without releasing and re-acquiring the lock */ +bool rtm_rw_mutex::scoped_lock::upgrade_to_writer() { + __TBB_ASSERT(m_mutex, "lock is not acquired"); + if (m_transaction_state == rtm_type::rtm_transacting_writer || m_transaction_state == rtm_type::rtm_real_writer) { + return true; // Already a writer + } + return r1::upgrade(*this); +} + +//! Downgrade writer to become a reader. 
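+/** (Annotation added in this port.) Returns true immediately when the lock is already
+    held as a reader; otherwise the request is forwarded to the r1::downgrade entry point. */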
+bool rtm_rw_mutex::scoped_lock::downgrade_to_reader() { + __TBB_ASSERT(m_mutex, "lock is not acquired"); + if (m_transaction_state == rtm_type::rtm_transacting_reader || m_transaction_state == rtm_type::rtm_real_reader) { + return true; // Already a reader + } + return r1::downgrade(*this); +} + +bool rtm_rw_mutex::scoped_lock::is_writer() const { + __TBB_ASSERT(m_mutex, "lock is not acquired"); + return m_transaction_state == rtm_type::rtm_transacting_writer || m_transaction_state == rtm_type::rtm_real_writer; +} + +#if TBB_USE_PROFILING_TOOLS +inline void set_name(rtm_rw_mutex& obj, const char* name) { + itt_set_sync_name(&obj, name); +} +#if (_WIN32||_WIN64) +inline void set_name(rtm_rw_mutex& obj, const wchar_t* name) { + itt_set_sync_name(&obj, name); +} +#endif // WIN +#else +inline void set_name(rtm_rw_mutex&, const char*) {} +#if (_WIN32||_WIN64) +inline void set_name(rtm_rw_mutex&, const wchar_t*) {} +#endif // WIN +#endif + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail__rtm_rw_mutex_H diff --git a/third_party/tbb/detail/_scoped_lock.h b/third_party/tbb/detail/_scoped_lock.h new file mode 100644 index 000000000..640d15d10 --- /dev/null +++ b/third_party/tbb/detail/_scoped_lock.h @@ -0,0 +1,175 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail_scoped_lock_H +#define __TBB_detail_scoped_lock_H + +namespace tbb { +namespace detail { +namespace d1 { + +// unique_scoped_lock supposes that Mutex operations never throw +template +class unique_scoped_lock { + //! Points to currently held Mutex, or nullptr if no lock is held. + Mutex* m_mutex{}; + +public: + //! Construct without acquiring a Mutex. + constexpr unique_scoped_lock() noexcept : m_mutex(nullptr) {} + + //! Construct and acquire lock on a Mutex. + unique_scoped_lock(Mutex& m) { + acquire(m); + } + + //! No Copy + unique_scoped_lock(const unique_scoped_lock&) = delete; + unique_scoped_lock& operator=(const unique_scoped_lock&) = delete; + + //! Acquire lock. + void acquire(Mutex& m) { + __TBB_ASSERT(m_mutex == nullptr, "The mutex is already acquired"); + m_mutex = &m; + m.lock(); + } + + //! Try acquiring lock (non-blocking) + /** Return true if lock acquired; false otherwise. */ + bool try_acquire(Mutex& m) { + __TBB_ASSERT(m_mutex == nullptr, "The mutex is already acquired"); + bool succeed = m.try_lock(); + if (succeed) { + m_mutex = &m; + } + return succeed; + } + + //! Release lock + void release() { + __TBB_ASSERT(m_mutex, "release on Mutex::unique_scoped_lock that is not holding a lock"); + m_mutex->unlock(); + m_mutex = nullptr; + } + + //! Destroy lock. If holding a lock, releases the lock first. + ~unique_scoped_lock() { + if (m_mutex) { + release(); + } + } +}; + +// rw_scoped_lock supposes that Mutex operations never throw +template +class rw_scoped_lock { +public: + //! Construct lock that has not acquired a mutex. + /** Equivalent to zero-initialization of *this. 
*/ + constexpr rw_scoped_lock() noexcept {} + + //! Acquire lock on given mutex. + rw_scoped_lock(Mutex& m, bool write = true) { + acquire(m, write); + } + + //! Release lock (if lock is held). + ~rw_scoped_lock() { + if (m_mutex) { + release(); + } + } + + //! No Copy + rw_scoped_lock(const rw_scoped_lock&) = delete; + rw_scoped_lock& operator=(const rw_scoped_lock&) = delete; + + //! Acquire lock on given mutex. + void acquire(Mutex& m, bool write = true) { + __TBB_ASSERT(m_mutex == nullptr, "The mutex is already acquired"); + m_is_writer = write; + m_mutex = &m; + if (write) { + m_mutex->lock(); + } else { + m_mutex->lock_shared(); + } + } + + //! Try acquire lock on given mutex. + bool try_acquire(Mutex& m, bool write = true) { + bool succeed = write ? m.try_lock() : m.try_lock_shared(); + if (succeed) { + m_mutex = &m; + m_is_writer = write; + } + return succeed; + } + + //! Release lock. + void release() { + __TBB_ASSERT(m_mutex != nullptr, "The mutex is not acquired"); + Mutex* m = m_mutex; + m_mutex = nullptr; + + if (m_is_writer) { + m->unlock(); + } else { + m->unlock_shared(); + } + } + + //! Upgrade reader to become a writer. + /** Returns whether the upgrade happened without releasing and re-acquiring the lock */ + bool upgrade_to_writer() { + __TBB_ASSERT(m_mutex != nullptr, "The mutex is not acquired"); + if (m_is_writer) { + return true; // Already a writer + } + m_is_writer = true; + return m_mutex->upgrade(); + } + + //! Downgrade writer to become a reader. + bool downgrade_to_reader() { + __TBB_ASSERT(m_mutex != nullptr, "The mutex is not acquired"); + if (m_is_writer) { + m_mutex->downgrade(); + m_is_writer = false; + } + return true; + } + + bool is_writer() const { + __TBB_ASSERT(m_mutex != nullptr, "The mutex is not acquired"); + return m_is_writer; + } + +protected: + //! The pointer to the current mutex that is held, or nullptr if no mutex is held. + Mutex* m_mutex {nullptr}; + + //! If mutex != nullptr, then is_writer is true if holding a writer lock, false if holding a reader lock. + /** Not defined if not holding a lock. */ + bool m_is_writer {false}; +}; + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail_scoped_lock_H diff --git a/third_party/tbb/detail/_segment_table.h b/third_party/tbb/detail/_segment_table.h new file mode 100644 index 000000000..a9f570a72 --- /dev/null +++ b/third_party/tbb/detail/_segment_table.h @@ -0,0 +1,567 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_detail__segment_table_H +#define __TBB_detail__segment_table_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_allocator_traits.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/type_traits" +#include "third_party/libcxx/memory" +#include "third_party/libcxx/cstring" + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma warning(push) +#pragma warning(disable: 4127) // warning C4127: conditional expression is constant +#endif + +namespace tbb { +namespace detail { +namespace d1 { + +template +class segment_table { +public: + using value_type = T; + using segment_type = T*; + using atomic_segment = std::atomic; + using segment_table_type = atomic_segment*; + + using size_type = std::size_t; + using segment_index_type = std::size_t; + + using allocator_type = Allocator; + + using allocator_traits_type = tbb::detail::allocator_traits; + using segment_table_allocator_type = typename allocator_traits_type::template rebind_alloc; +protected: + using segment_table_allocator_traits = tbb::detail::allocator_traits; + using derived_type = DerivedType; + + static constexpr size_type pointers_per_embedded_table = PointersPerEmbeddedTable; + static constexpr size_type pointers_per_long_table = sizeof(size_type) * 8; +public: + segment_table( const allocator_type& alloc = allocator_type() ) + : my_segment_table_allocator(alloc), my_segment_table(nullptr) + , my_first_block{}, my_size{}, my_segment_table_allocation_failed{} + { + my_segment_table.store(my_embedded_table, std::memory_order_relaxed); + zero_table(my_embedded_table, pointers_per_embedded_table); + } + + segment_table( const segment_table& other ) + : my_segment_table_allocator(segment_table_allocator_traits:: + select_on_container_copy_construction(other.my_segment_table_allocator)) + , my_segment_table(nullptr), my_first_block{}, my_size{}, my_segment_table_allocation_failed{} + { + my_segment_table.store(my_embedded_table, std::memory_order_relaxed); + zero_table(my_embedded_table, pointers_per_embedded_table); + try_call( [&] { + internal_transfer(other, copy_segment_body_type{*this}); + } ).on_exception( [&] { + clear(); + }); + } + + segment_table( const segment_table& other, const allocator_type& alloc ) + : my_segment_table_allocator(alloc), my_segment_table(nullptr) + , my_first_block{}, my_size{}, my_segment_table_allocation_failed{} + { + my_segment_table.store(my_embedded_table, std::memory_order_relaxed); + zero_table(my_embedded_table, pointers_per_embedded_table); + try_call( [&] { + internal_transfer(other, copy_segment_body_type{*this}); + } ).on_exception( [&] { + clear(); + }); + } + + segment_table( segment_table&& other ) + : my_segment_table_allocator(std::move(other.my_segment_table_allocator)), my_segment_table(nullptr) + , my_first_block{}, my_size{}, my_segment_table_allocation_failed{} + { + my_segment_table.store(my_embedded_table, std::memory_order_relaxed); + zero_table(my_embedded_table, pointers_per_embedded_table); + internal_move(std::move(other)); + } + + segment_table( segment_table&& other, const allocator_type& alloc ) + : my_segment_table_allocator(alloc), my_segment_table(nullptr), my_first_block{} + , my_size{}, my_segment_table_allocation_failed{} + { + my_segment_table.store(my_embedded_table, std::memory_order_relaxed); + 
zero_table(my_embedded_table, pointers_per_embedded_table); + using is_equal_type = typename segment_table_allocator_traits::is_always_equal; + internal_move_construct_with_allocator(std::move(other), alloc, is_equal_type()); + } + + ~segment_table() { + clear(); + } + + segment_table& operator=( const segment_table& other ) { + if (this != &other) { + copy_assign_allocators(my_segment_table_allocator, other.my_segment_table_allocator); + internal_transfer(other, copy_segment_body_type{*this}); + } + return *this; + } + + segment_table& operator=( segment_table&& other ) + noexcept(derived_type::is_noexcept_assignment) + { + using pocma_type = typename segment_table_allocator_traits::propagate_on_container_move_assignment; + using is_equal_type = typename segment_table_allocator_traits::is_always_equal; + + if (this != &other) { + move_assign_allocators(my_segment_table_allocator, other.my_segment_table_allocator); + internal_move_assign(std::move(other), tbb::detail::disjunction()); + } + return *this; + } + + void swap( segment_table& other ) + noexcept(derived_type::is_noexcept_swap) + { + using is_equal_type = typename segment_table_allocator_traits::is_always_equal; + using pocs_type = typename segment_table_allocator_traits::propagate_on_container_swap; + + if (this != &other) { + swap_allocators(my_segment_table_allocator, other.my_segment_table_allocator); + internal_swap(other, tbb::detail::disjunction()); + } + } + + segment_type get_segment( segment_index_type index ) const { + return get_table()[index] + segment_base(index); + } + + value_type& operator[]( size_type index ) { + return internal_subscript(index); + } + + const value_type& operator[]( size_type index ) const { + return const_cast(this)->internal_subscript(index); + } + + const segment_table_allocator_type& get_allocator() const { + return my_segment_table_allocator; + } + + segment_table_allocator_type& get_allocator() { + return my_segment_table_allocator; + } + + void enable_segment( segment_type& segment, segment_table_type table, segment_index_type seg_index, size_type index ) { + // Allocate new segment + segment_type new_segment = self()->create_segment(table, seg_index, index); + if (new_segment != nullptr) { + // Store (new_segment - segment_base) into the segment table to allow access to the table by index via + // my_segment_table[segment_index_of(index)][index] + segment_type disabled_segment = nullptr; + if (!table[seg_index].compare_exchange_strong(disabled_segment, new_segment - segment_base(seg_index))) { + // compare_exchange failed => some other thread has already enabled this segment + // Deallocate the memory + self()->deallocate_segment(new_segment, seg_index); + } + } + + segment = table[seg_index].load(std::memory_order_acquire); + __TBB_ASSERT(segment != nullptr, "If create_segment returned nullptr, the element should be stored in the table"); + } + + void delete_segment( segment_index_type seg_index ) { + segment_type segment_to_delete = self()->nullify_segment(get_table(), seg_index); + if (segment_to_delete == segment_allocation_failure_tag) { + return; + } + + segment_to_delete += segment_base(seg_index); + + // Deallocate the segment + self()->destroy_segment(segment_to_delete, seg_index); + } + + size_type number_of_segments( segment_table_type table ) const { + // Check for an active table, if it is embedded table - return the number of embedded segments + // Otherwise - return the maximum number of segments + return table == my_embedded_table ? 
pointers_per_embedded_table : pointers_per_long_table; + } + + size_type capacity() const noexcept { + segment_table_type table = get_table(); + size_type num_segments = number_of_segments(table); + for (size_type seg_index = 0; seg_index < num_segments; ++seg_index) { + // Check if the pointer is valid (allocated) + if (table[seg_index].load(std::memory_order_relaxed) <= segment_allocation_failure_tag) { + return segment_base(seg_index); + } + } + return segment_base(num_segments); + } + + size_type find_last_allocated_segment( segment_table_type table ) const noexcept { + size_type end = 0; + size_type num_segments = number_of_segments(table); + for (size_type seg_index = 0; seg_index < num_segments; ++seg_index) { + // Check if the pointer is valid (allocated) + if (table[seg_index].load(std::memory_order_relaxed) > segment_allocation_failure_tag) { + end = seg_index + 1; + } + } + return end; + } + + void reserve( size_type n ) { + if (n > allocator_traits_type::max_size(my_segment_table_allocator)) { + throw_exception(exception_id::reservation_length_error); + } + + size_type size = my_size.load(std::memory_order_relaxed); + segment_index_type start_seg_idx = size == 0 ? 0 : segment_index_of(size - 1) + 1; + for (segment_index_type seg_idx = start_seg_idx; segment_base(seg_idx) < n; ++seg_idx) { + size_type first_index = segment_base(seg_idx); + internal_subscript(first_index); + } + } + + void clear() { + clear_segments(); + clear_table(); + my_size.store(0, std::memory_order_relaxed); + my_first_block.store(0, std::memory_order_relaxed); + } + + void clear_segments() { + segment_table_type current_segment_table = get_table(); + for (size_type i = number_of_segments(current_segment_table); i != 0; --i) { + if (current_segment_table[i - 1].load(std::memory_order_relaxed) != nullptr) { + // If the segment was enabled - disable and deallocate it + delete_segment(i - 1); + } + } + } + + void clear_table() { + segment_table_type current_segment_table = get_table(); + if (current_segment_table != my_embedded_table) { + // If the active table is not the embedded one - deallocate the active table + for (size_type i = 0; i != pointers_per_long_table; ++i) { + segment_table_allocator_traits::destroy(my_segment_table_allocator, ¤t_segment_table[i]); + } + + segment_table_allocator_traits::deallocate(my_segment_table_allocator, current_segment_table, pointers_per_long_table); + my_segment_table.store(my_embedded_table, std::memory_order_relaxed); + zero_table(my_embedded_table, pointers_per_embedded_table); + } + } + + void extend_table_if_necessary(segment_table_type& table, size_type start_index, size_type end_index) { + // extend_segment_table if an active table is an embedded table + // and the requested index is not in the embedded table + if (table == my_embedded_table && end_index > embedded_table_size) { + if (start_index <= embedded_table_size) { + try_call([&] { + table = self()->allocate_long_table(my_embedded_table, start_index); + // It is possible that the table was extended by the thread that allocated first_block. + // In this case it is necessary to re-read the current table. 
+ + if (table) { + my_segment_table.store(table, std::memory_order_release); + } else { + table = my_segment_table.load(std::memory_order_acquire); + } + }).on_exception([&] { + my_segment_table_allocation_failed.store(true, std::memory_order_relaxed); + }); + } else { + atomic_backoff backoff; + do { + if (my_segment_table_allocation_failed.load(std::memory_order_relaxed)) { + throw_exception(exception_id::bad_alloc); + } + backoff.pause(); + table = my_segment_table.load(std::memory_order_acquire); + } while (table == my_embedded_table); + } + } + } + + // Return the segment where index is stored + static constexpr segment_index_type segment_index_of( size_type index ) { + return size_type(tbb::detail::log2(uintptr_t(index|1))); + } + + // Needed to calculate the offset in segment + static constexpr size_type segment_base( size_type index ) { + return size_type(1) << index & ~size_type(1); + } + + // Return size of the segment + static constexpr size_type segment_size( size_type index ) { + return index == 0 ? 2 : size_type(1) << index; + } + +private: + + derived_type* self() { + return static_cast(this); + } + + struct copy_segment_body_type { + void operator()( segment_index_type index, segment_type from, segment_type to ) const { + my_instance.self()->copy_segment(index, from, to); + } + segment_table& my_instance; + }; + + struct move_segment_body_type { + void operator()( segment_index_type index, segment_type from, segment_type to ) const { + my_instance.self()->move_segment(index, from, to); + } + segment_table& my_instance; + }; + + // Transgers all segments from the other table + template + void internal_transfer( const segment_table& other, TransferBody transfer_segment ) { + static_cast(this)->destroy_elements(); + + assign_first_block_if_necessary(other.my_first_block.load(std::memory_order_relaxed)); + my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); + + segment_table_type other_table = other.get_table(); + size_type end_segment_size = segment_size(other.find_last_allocated_segment(other_table)); + + // If an exception occurred in other, then the size may be greater than the size of the end segment. + size_type other_size = end_segment_size < other.my_size.load(std::memory_order_relaxed) ? + other.my_size.load(std::memory_order_relaxed) : end_segment_size; + other_size = my_segment_table_allocation_failed ? 
embedded_table_size : other_size; + + for (segment_index_type i = 0; segment_base(i) < other_size; ++i) { + // If the segment in other table is enabled - transfer it + if (other_table[i].load(std::memory_order_relaxed) == segment_allocation_failure_tag) + { + my_size = segment_base(i); + break; + } else if (other_table[i].load(std::memory_order_relaxed) != nullptr) { + internal_subscript(segment_base(i)); + transfer_segment(i, other.get_table()[i].load(std::memory_order_relaxed) + segment_base(i), + get_table()[i].load(std::memory_order_relaxed) + segment_base(i)); + } + } + } + + // Moves the other segment table + // Only equal allocators are allowed + void internal_move( segment_table&& other ) { + // NOTE: allocators should be equal + clear(); + my_first_block.store(other.my_first_block.load(std::memory_order_relaxed), std::memory_order_relaxed); + my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); + // If an active table in other is embedded - restore all of the embedded segments + if (other.get_table() == other.my_embedded_table) { + for ( size_type i = 0; i != pointers_per_embedded_table; ++i ) { + segment_type other_segment = other.my_embedded_table[i].load(std::memory_order_relaxed); + my_embedded_table[i].store(other_segment, std::memory_order_relaxed); + other.my_embedded_table[i].store(nullptr, std::memory_order_relaxed); + } + my_segment_table.store(my_embedded_table, std::memory_order_relaxed); + } else { + my_segment_table.store(other.my_segment_table, std::memory_order_relaxed); + other.my_segment_table.store(other.my_embedded_table, std::memory_order_relaxed); + zero_table(other.my_embedded_table, pointers_per_embedded_table); + } + other.my_size.store(0, std::memory_order_relaxed); + } + + // Move construct the segment table with the allocator object + // if any instances of allocator_type are always equal + void internal_move_construct_with_allocator( segment_table&& other, const allocator_type&, + /*is_always_equal = */ std::true_type ) { + internal_move(std::move(other)); + } + + // Move construct the segment table with the allocator object + // if any instances of allocator_type are always equal + void internal_move_construct_with_allocator( segment_table&& other, const allocator_type& alloc, + /*is_always_equal = */ std::false_type ) { + if (other.my_segment_table_allocator == alloc) { + // If allocators are equal - restore pointers + internal_move(std::move(other)); + } else { + // If allocators are not equal - perform per element move with reallocation + try_call( [&] { + internal_transfer(other, move_segment_body_type{*this}); + } ).on_exception( [&] { + clear(); + }); + } + } + + // Move assigns the segment table to other is any instances of allocator_type are always equal + // or propagate_on_container_move_assignment is true + void internal_move_assign( segment_table&& other, /*is_always_equal || POCMA = */ std::true_type ) { + internal_move(std::move(other)); + } + + // Move assigns the segment table to other is any instances of allocator_type are not always equal + // and propagate_on_container_move_assignment is false + void internal_move_assign( segment_table&& other, /*is_always_equal || POCMA = */ std::false_type ) { + if (my_segment_table_allocator == other.my_segment_table_allocator) { + // If allocators are equal - restore pointers + internal_move(std::move(other)); + } else { + // If allocators are not equal - perform per element move with reallocation + internal_transfer(other, move_segment_body_type{*this}); + } 
+ } + + // Swaps two segment tables if any instances of allocator_type are always equal + // or propagate_on_container_swap is true + void internal_swap( segment_table& other, /*is_always_equal || POCS = */ std::true_type ) { + internal_swap_fields(other); + } + + // Swaps two segment tables if any instances of allocator_type are not always equal + // and propagate_on_container_swap is false + // According to the C++ standard, swapping of two containers with unequal allocators + // is an undefined behavior scenario + void internal_swap( segment_table& other, /*is_always_equal || POCS = */ std::false_type ) { + __TBB_ASSERT(my_segment_table_allocator == other.my_segment_table_allocator, + "Swapping with unequal allocators is not allowed"); + internal_swap_fields(other); + } + + void internal_swap_fields( segment_table& other ) { + // If an active table in either *this segment table or other is an embedded one - swaps the embedded tables + if (get_table() == my_embedded_table || + other.get_table() == other.my_embedded_table) { + + for (size_type i = 0; i != pointers_per_embedded_table; ++i) { + segment_type current_segment = my_embedded_table[i].load(std::memory_order_relaxed); + segment_type other_segment = other.my_embedded_table[i].load(std::memory_order_relaxed); + + my_embedded_table[i].store(other_segment, std::memory_order_relaxed); + other.my_embedded_table[i].store(current_segment, std::memory_order_relaxed); + } + } + + segment_table_type current_segment_table = get_table(); + segment_table_type other_segment_table = other.get_table(); + + // If an active table is an embedded one - + // store an active table in other to the embedded one from other + if (current_segment_table == my_embedded_table) { + other.my_segment_table.store(other.my_embedded_table, std::memory_order_relaxed); + } else { + // Otherwise - store it to the active segment table + other.my_segment_table.store(current_segment_table, std::memory_order_relaxed); + } + + // If an active table in other segment table is an embedded one - + // store an active table in other to the embedded one from *this + if (other_segment_table == other.my_embedded_table) { + my_segment_table.store(my_embedded_table, std::memory_order_relaxed); + } else { + // Otherwise - store it to the active segment table in other + my_segment_table.store(other_segment_table, std::memory_order_relaxed); + } + auto first_block = other.my_first_block.load(std::memory_order_relaxed); + other.my_first_block.store(my_first_block.load(std::memory_order_relaxed), std::memory_order_relaxed); + my_first_block.store(first_block, std::memory_order_relaxed); + + auto size = other.my_size.load(std::memory_order_relaxed); + other.my_size.store(my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); + my_size.store(size, std::memory_order_relaxed); + } + +protected: + // A flag indicates that an exception was throws during segment allocations + const segment_type segment_allocation_failure_tag = reinterpret_cast(1); + static constexpr size_type embedded_table_size = segment_size(pointers_per_embedded_table); + + template + value_type& internal_subscript( size_type index ) { + segment_index_type seg_index = segment_index_of(index); + segment_table_type table = my_segment_table.load(std::memory_order_acquire); + segment_type segment = nullptr; + + if (allow_out_of_range_access) { + if (derived_type::allow_table_extending) { + extend_table_if_necessary(table, index, index + 1); + } + + segment = table[seg_index].load(std::memory_order_acquire); + // If 
the required segment is disabled - enable it + if (segment == nullptr) { + enable_segment(segment, table, seg_index, index); + } + // Check if an exception was thrown during segment allocation + if (segment == segment_allocation_failure_tag) { + throw_exception(exception_id::bad_alloc); + } + } else { + segment = table[seg_index].load(std::memory_order_acquire); + } + __TBB_ASSERT(segment != nullptr, nullptr); + + return segment[index]; + } + + void assign_first_block_if_necessary(segment_index_type index) { + size_type zero = 0; + if (this->my_first_block.load(std::memory_order_relaxed) == zero) { + this->my_first_block.compare_exchange_strong(zero, index); + } + } + + void zero_table( segment_table_type table, size_type count ) { + for (size_type i = 0; i != count; ++i) { + table[i].store(nullptr, std::memory_order_relaxed); + } + } + + segment_table_type get_table() const { + return my_segment_table.load(std::memory_order_acquire); + } + + segment_table_allocator_type my_segment_table_allocator; + std::atomic my_segment_table; + atomic_segment my_embedded_table[pointers_per_embedded_table]; + // Number of segments in first block + std::atomic my_first_block; + // Number of elements in table + std::atomic my_size; + // Flag to indicate failed extend table + std::atomic my_segment_table_allocation_failed; +}; // class segment_table + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma warning(pop) // warning 4127 is back +#endif + +#endif // __TBB_detail__segment_table_H diff --git a/third_party/tbb/detail/_small_object_pool.h b/third_party/tbb/detail/_small_object_pool.h new file mode 100644 index 000000000..114858597 --- /dev/null +++ b/third_party/tbb/detail/_small_object_pool.h @@ -0,0 +1,109 @@ +// clang-format off +/* + Copyright (c) 2020-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB__small_object_pool_H +#define __TBB__small_object_pool_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_assert.h" + +#include "third_party/tbb/profiling.h" +#include "third_party/libcxx/cstddef" +#include "third_party/libcxx/cstdint" +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { + +namespace d1 { +class small_object_pool { +protected: + small_object_pool() = default; +}; +struct execution_data; +} + +namespace r1 { +TBB_EXPORT void* __TBB_EXPORTED_FUNC allocate(d1::small_object_pool*& pool, std::size_t number_of_bytes, + const d1::execution_data& ed); +TBB_EXPORT void* __TBB_EXPORTED_FUNC allocate(d1::small_object_pool*& pool, std::size_t number_of_bytes); +TBB_EXPORT void __TBB_EXPORTED_FUNC deallocate(d1::small_object_pool& pool, void* ptr, std::size_t number_of_bytes, + const d1::execution_data& ed); +TBB_EXPORT void __TBB_EXPORTED_FUNC deallocate(d1::small_object_pool& pool, void* ptr, std::size_t number_of_bytes); +} + +namespace d1 { +class small_object_allocator { +public: + template + Type* new_object(execution_data& ed, Args&&... args) { + void* allocated_object = r1::allocate(m_pool, sizeof(Type), ed); + + auto constructed_object = new(allocated_object) Type(std::forward(args)...); + return constructed_object; + } + + template + Type* new_object(Args&&... args) { + void* allocated_object = r1::allocate(m_pool, sizeof(Type)); + + auto constructed_object = new(allocated_object) Type(std::forward(args)...); + return constructed_object; + } + + template + void delete_object(Type* object, const execution_data& ed) { + // Copy this since it can be a member of the passed object and + // unintentionally destroyed when Type destructor is called below + small_object_allocator alloc = *this; + object->~Type(); + alloc.deallocate(object, ed); + } + + template + void delete_object(Type* object) { + // Copy this since it can be a member of the passed object and + // unintentionally destroyed when Type destructor is called below + small_object_allocator alloc = *this; + object->~Type(); + alloc.deallocate(object); + } + + template + void deallocate(Type* ptr, const execution_data& ed) { + call_itt_task_notify(destroy, ptr); + + __TBB_ASSERT(m_pool != nullptr, "Pool must be valid for deallocate call"); + r1::deallocate(*m_pool, ptr, sizeof(Type), ed); + } + + template + void deallocate(Type* ptr) { + call_itt_task_notify(destroy, ptr); + + __TBB_ASSERT(m_pool != nullptr, "Pool must be valid for deallocate call"); + r1::deallocate(*m_pool, ptr, sizeof(Type)); + } +private: + small_object_pool* m_pool{}; +}; + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif /* __TBB__small_object_pool_H */ diff --git a/third_party/tbb/detail/_string_resource.h b/third_party/tbb/detail/_string_resource.h new file mode 100644 index 000000000..d1dd46d1b --- /dev/null +++ b/third_party/tbb/detail/_string_resource.h @@ -0,0 +1,79 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +TBB_STRING_RESOURCE(ALGORITHM, "tbb_algorithm") +TBB_STRING_RESOURCE(PARALLEL_FOR, "tbb_parallel_for") +TBB_STRING_RESOURCE(PARALLEL_FOR_EACH, "tbb_parallel_for_each") +TBB_STRING_RESOURCE(PARALLEL_INVOKE, "tbb_parallel_invoke") +TBB_STRING_RESOURCE(PARALLEL_REDUCE, "tbb_parallel_reduce") +TBB_STRING_RESOURCE(PARALLEL_SCAN, "tbb_parallel_scan") +TBB_STRING_RESOURCE(PARALLEL_SORT, "tbb_parallel_sort") +TBB_STRING_RESOURCE(PARALLEL_PIPELINE, "tbb_parallel_pipeline") +TBB_STRING_RESOURCE(CUSTOM_CTX, "tbb_custom") + +TBB_STRING_RESOURCE(FLOW_NULL, "null") +TBB_STRING_RESOURCE(FLOW_BROADCAST_NODE, "broadcast_node") +TBB_STRING_RESOURCE(FLOW_BUFFER_NODE, "buffer_node") +TBB_STRING_RESOURCE(FLOW_CONTINUE_NODE, "continue_node") +TBB_STRING_RESOURCE(FLOW_FUNCTION_NODE, "function_node") +TBB_STRING_RESOURCE(FLOW_JOIN_NODE_QUEUEING, "join_node (queueing)") +TBB_STRING_RESOURCE(FLOW_JOIN_NODE_RESERVING, "join_node (reserving)") +TBB_STRING_RESOURCE(FLOW_JOIN_NODE_TAG_MATCHING, "join_node (tag_matching)") +TBB_STRING_RESOURCE(FLOW_LIMITER_NODE, "limiter_node") +TBB_STRING_RESOURCE(FLOW_MULTIFUNCTION_NODE, "multifunction_node") +TBB_STRING_RESOURCE(FLOW_OVERWRITE_NODE, "overwrite_node") +TBB_STRING_RESOURCE(FLOW_PRIORITY_QUEUE_NODE, "priority_queue_node") +TBB_STRING_RESOURCE(FLOW_QUEUE_NODE, "queue_node") +TBB_STRING_RESOURCE(FLOW_SEQUENCER_NODE, "sequencer_node") +TBB_STRING_RESOURCE(FLOW_INPUT_NODE, "input_node") +TBB_STRING_RESOURCE(FLOW_SPLIT_NODE, "split_node") +TBB_STRING_RESOURCE(FLOW_WRITE_ONCE_NODE, "write_once_node") +TBB_STRING_RESOURCE(FLOW_INDEXER_NODE, "indexer_node") +TBB_STRING_RESOURCE(FLOW_COMPOSITE_NODE, "composite_node") +TBB_STRING_RESOURCE(FLOW_ASYNC_NODE, "async_node") +TBB_STRING_RESOURCE(FLOW_INPUT_PORT, "input_port") +TBB_STRING_RESOURCE(FLOW_INPUT_PORT_0, "input_port_0") +TBB_STRING_RESOURCE(FLOW_INPUT_PORT_1, "input_port_1") +TBB_STRING_RESOURCE(FLOW_INPUT_PORT_2, "input_port_2") +TBB_STRING_RESOURCE(FLOW_INPUT_PORT_3, "input_port_3") +TBB_STRING_RESOURCE(FLOW_INPUT_PORT_4, "input_port_4") +TBB_STRING_RESOURCE(FLOW_INPUT_PORT_5, "input_port_5") +TBB_STRING_RESOURCE(FLOW_INPUT_PORT_6, "input_port_6") +TBB_STRING_RESOURCE(FLOW_INPUT_PORT_7, "input_port_7") +TBB_STRING_RESOURCE(FLOW_INPUT_PORT_8, "input_port_8") +TBB_STRING_RESOURCE(FLOW_INPUT_PORT_9, "input_port_9") +TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT, "output_port") +TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_0, "output_port_0") +TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_1, "output_port_1") +TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_2, "output_port_2") +TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_3, "output_port_3") +TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_4, "output_port_4") +TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_5, "output_port_5") +TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_6, "output_port_6") +TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_7, "output_port_7") +TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_8, "output_port_8") +TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_9, "output_port_9") +TBB_STRING_RESOURCE(FLOW_OBJECT_NAME, "object_name") +TBB_STRING_RESOURCE(FLOW_BODY, "body") +TBB_STRING_RESOURCE(FLOW_GRAPH, "graph") +TBB_STRING_RESOURCE(FLOW_NODE, "node") +TBB_STRING_RESOURCE(FLOW_TASKS, "tbb_flow_graph") +TBB_STRING_RESOURCE(USER_EVENT, "user_event") + +#if __TBB_FLOW_TRACE_CODEPTR +TBB_STRING_RESOURCE(CODE_ADDRESS, "code_address") +#endif diff --git a/third_party/tbb/detail/_task.h b/third_party/tbb/detail/_task.h new file mode 100644 index 000000000..0413c0bed 
--- /dev/null +++ b/third_party/tbb/detail/_task.h @@ -0,0 +1,233 @@ +// clang-format off +/* + Copyright (c) 2020-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB__task_H +#define __TBB__task_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_small_object_pool.h" + +#include "third_party/tbb/profiling.h" + +#include "third_party/libcxx/cstddef" +#include "third_party/libcxx/cstdint" +#include "third_party/libcxx/climits" +#include "third_party/libcxx/utility" +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/mutex" + +namespace tbb { +namespace detail { + +namespace d1 { +using slot_id = unsigned short; +constexpr slot_id no_slot = slot_id(~0); +constexpr slot_id any_slot = slot_id(~1); + +class task; +class wait_context; +class task_group_context; +struct execution_data; +} + +namespace r1 { +//! Task spawn/wait entry points +TBB_EXPORT void __TBB_EXPORTED_FUNC spawn(d1::task& t, d1::task_group_context& ctx); +TBB_EXPORT void __TBB_EXPORTED_FUNC spawn(d1::task& t, d1::task_group_context& ctx, d1::slot_id id); +TBB_EXPORT void __TBB_EXPORTED_FUNC execute_and_wait(d1::task& t, d1::task_group_context& t_ctx, d1::wait_context&, d1::task_group_context& w_ctx); +TBB_EXPORT void __TBB_EXPORTED_FUNC wait(d1::wait_context&, d1::task_group_context& ctx); +TBB_EXPORT d1::slot_id __TBB_EXPORTED_FUNC execution_slot(const d1::execution_data*); +TBB_EXPORT d1::task_group_context* __TBB_EXPORTED_FUNC current_context(); + +// Do not place under __TBB_RESUMABLE_TASKS. It is a stub for unsupported platforms. +struct suspend_point_type; +using suspend_callback_type = void(*)(void*, suspend_point_type*); +//! The resumable tasks entry points +TBB_EXPORT void __TBB_EXPORTED_FUNC suspend(suspend_callback_type suspend_callback, void* user_callback); +TBB_EXPORT void __TBB_EXPORTED_FUNC resume(suspend_point_type* tag); +TBB_EXPORT suspend_point_type* __TBB_EXPORTED_FUNC current_suspend_point(); +TBB_EXPORT void __TBB_EXPORTED_FUNC notify_waiters(std::uintptr_t wait_ctx_addr); + +class thread_data; +class task_dispatcher; +class external_waiter; +struct task_accessor; +struct task_arena_impl; +} // namespace r1 + +namespace d1 { + +class task_arena; +using suspend_point = r1::suspend_point_type*; + +#if __TBB_RESUMABLE_TASKS +template +static void suspend_callback(void* user_callback, suspend_point sp) { + // Copy user function to a new stack after the context switch to avoid a race when the previous + // suspend point is resumed while the user_callback is being called. 
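+    // A minimal usage sketch of the suspend()/resume() pair defined just below
+    // (illustrative only; `async_engine` and its submit() call are hypothetical
+    // names, not part of these sources):
+    //
+    //   d1::suspend([&](d1::suspend_point sp) {
+    //       // Hand the suspend point to some asynchronous activity; the
+    //       // suspended task continues once that activity calls resume(sp).
+    //       async_engine.submit([sp] { d1::resume(sp); });
+    //   });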
+ F user_callback_copy = *static_cast(user_callback); + user_callback_copy(sp); +} + +template +void suspend(F f) { + r1::suspend(&suspend_callback, &f); +} + +inline void resume(suspend_point tag) { + r1::resume(tag); +} +#endif /* __TBB_RESUMABLE_TASKS */ + +// TODO align wait_context on cache lane +class wait_context { + static constexpr std::uint64_t overflow_mask = ~((1LLU << 32) - 1); + + std::uint64_t m_version_and_traits{1}; + std::atomic m_ref_count{}; + + void add_reference(std::int64_t delta) { + call_itt_task_notify(releasing, this); + std::uint64_t r = m_ref_count.fetch_add(static_cast(delta)) + static_cast(delta); + + __TBB_ASSERT_EX((r & overflow_mask) == 0, "Overflow is detected"); + + if (!r) { + // Some external waiters or coroutine waiters sleep in wait list + // Should to notify them that work is done + std::uintptr_t wait_ctx_addr = std::uintptr_t(this); + r1::notify_waiters(wait_ctx_addr); + } + } + + bool continue_execution() const { + std::uint64_t r = m_ref_count.load(std::memory_order_acquire); + __TBB_ASSERT_EX((r & overflow_mask) == 0, "Overflow is detected"); + return r > 0; + } + + friend class r1::thread_data; + friend class r1::task_dispatcher; + friend class r1::external_waiter; + friend class task_group; + friend class task_group_base; + friend struct r1::task_arena_impl; + friend struct r1::suspend_point_type; +public: + // Despite the internal reference count is uin64_t we limit the user interface with uint32_t + // to preserve a part of the internal reference count for special needs. + wait_context(std::uint32_t ref_count) : m_ref_count{ref_count} { suppress_unused_warning(m_version_and_traits); } + wait_context(const wait_context&) = delete; + + ~wait_context() { + __TBB_ASSERT(!continue_execution(), nullptr); + } + + void reserve(std::uint32_t delta = 1) { + add_reference(delta); + } + + void release(std::uint32_t delta = 1) { + add_reference(-std::int64_t(delta)); + } +}; + +struct execution_data { + task_group_context* context{}; + slot_id original_slot{}; + slot_id affinity_slot{}; +}; + +inline task_group_context* context(const execution_data& ed) { + return ed.context; +} + +inline slot_id original_slot(const execution_data& ed) { + return ed.original_slot; +} + +inline slot_id affinity_slot(const execution_data& ed) { + return ed.affinity_slot; +} + +inline slot_id execution_slot(const execution_data& ed) { + return r1::execution_slot(&ed); +} + +inline bool is_same_affinity(const execution_data& ed) { + return affinity_slot(ed) == no_slot || affinity_slot(ed) == execution_slot(ed); +} + +inline bool is_stolen(const execution_data& ed) { + return original_slot(ed) != execution_slot(ed); +} + +inline void spawn(task& t, task_group_context& ctx) { + call_itt_task_notify(releasing, &t); + r1::spawn(t, ctx); +} + +inline void spawn(task& t, task_group_context& ctx, slot_id id) { + call_itt_task_notify(releasing, &t); + r1::spawn(t, ctx, id); +} + +inline void execute_and_wait(task& t, task_group_context& t_ctx, wait_context& wait_ctx, task_group_context& w_ctx) { + r1::execute_and_wait(t, t_ctx, wait_ctx, w_ctx); + call_itt_task_notify(acquired, &wait_ctx); + call_itt_task_notify(destroy, &wait_ctx); +} + +inline void wait(wait_context& wait_ctx, task_group_context& ctx) { + r1::wait(wait_ctx, ctx); + call_itt_task_notify(acquired, &wait_ctx); + call_itt_task_notify(destroy, &wait_ctx); +} + +using r1::current_context; + +class task_traits { + std::uint64_t m_version_and_traits{}; + friend struct r1::task_accessor; +}; + +//! 
Alignment for a task object +static constexpr std::size_t task_alignment = 64; + +//! Base class for user-defined tasks. +/** @ingroup task_scheduling */ +class alignas(task_alignment) task : public task_traits { +protected: + virtual ~task() = default; + +public: + virtual task* execute(execution_data&) = 0; + virtual task* cancel(execution_data&) = 0; + +private: + std::uint64_t m_reserved[6]{}; + friend struct r1::task_accessor; +}; +static_assert(sizeof(task) == task_alignment, "task size is broken"); + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif /* __TBB__task_H */ diff --git a/third_party/tbb/detail/_task_handle.h b/third_party/tbb/detail/_task_handle.h new file mode 100644 index 000000000..c7bf32992 --- /dev/null +++ b/third_party/tbb/detail/_task_handle.h @@ -0,0 +1,123 @@ +// clang-format off +/* + Copyright (c) 2020-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + + +#ifndef __TBB_task_handle_H +#define __TBB_task_handle_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/detail/_small_object_pool.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/libcxx/memory" + +namespace tbb { +namespace detail { + +namespace d1 { class task_group_context; class wait_context; struct execution_data; } +namespace d2 { + +class task_handle; + +class task_handle_task : public d1::task { + std::uint64_t m_version_and_traits{}; + d1::wait_context& m_wait_ctx; + d1::task_group_context& m_ctx; + d1::small_object_allocator m_allocator; +public: + void finalize(const d1::execution_data* ed = nullptr) { + if (ed) { + m_allocator.delete_object(this, *ed); + } else { + m_allocator.delete_object(this); + } + } + + task_handle_task(d1::wait_context& wo, d1::task_group_context& ctx, d1::small_object_allocator& alloc) + : m_wait_ctx(wo) + , m_ctx(ctx) + , m_allocator(alloc) { + suppress_unused_warning(m_version_and_traits); + } + + ~task_handle_task() override { + m_wait_ctx.release(); + } + + d1::task_group_context& ctx() const { return m_ctx; } +}; + + +class task_handle { + struct task_handle_task_finalizer_t{ + void operator()(task_handle_task* p){ p->finalize(); } + }; + using handle_impl_t = std::unique_ptr; + + handle_impl_t m_handle = {nullptr}; +public: + task_handle() = default; + task_handle(task_handle&&) = default; + task_handle& operator=(task_handle&&) = default; + + explicit operator bool() const noexcept { return static_cast(m_handle); } + + friend bool operator==(task_handle const& th, std::nullptr_t) noexcept; + friend bool operator==(std::nullptr_t, task_handle const& th) noexcept; + + friend bool operator!=(task_handle const& th, std::nullptr_t) noexcept; + friend bool operator!=(std::nullptr_t, task_handle const& th) noexcept; + +private: + friend struct task_handle_accessor; + + task_handle(task_handle_task* t) : m_handle {t}{}; + + d1::task* release() { + return m_handle.release(); + } +}; + +struct task_handle_accessor { +static task_handle 
construct(task_handle_task* t) { return {t}; } +static d1::task* release(task_handle& th) { return th.release(); } +static d1::task_group_context& ctx_of(task_handle& th) { + __TBB_ASSERT(th.m_handle, "ctx_of does not expect empty task_handle."); + return th.m_handle->ctx(); +} +}; + +inline bool operator==(task_handle const& th, std::nullptr_t) noexcept { + return th.m_handle == nullptr; +} +inline bool operator==(std::nullptr_t, task_handle const& th) noexcept { + return th.m_handle == nullptr; +} + +inline bool operator!=(task_handle const& th, std::nullptr_t) noexcept { + return th.m_handle != nullptr; +} + +inline bool operator!=(std::nullptr_t, task_handle const& th) noexcept { + return th.m_handle != nullptr; +} + +} // namespace d2 +} // namespace detail +} // namespace tbb + +#endif /* __TBB_task_handle_H */ diff --git a/third_party/tbb/detail/_template_helpers.h b/third_party/tbb/detail/_template_helpers.h new file mode 100644 index 000000000..e27ff363e --- /dev/null +++ b/third_party/tbb/detail/_template_helpers.h @@ -0,0 +1,404 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__template_helpers_H +#define __TBB_detail__template_helpers_H + +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_config.h" + +#include "third_party/libcxx/cstddef" +#include "third_party/libcxx/cstdint" +#include "third_party/libcxx/utility" +#include "third_party/libcxx/type_traits" +#include "third_party/libcxx/memory" +#include "third_party/libcxx/iterator" + +namespace tbb { +namespace detail { +inline namespace d0 { + +// An internal implementation of void_t, which can be used in SFINAE contexts +template +struct void_impl { + using type = void; +}; // struct void_impl + +template +using void_t = typename void_impl::type; + +// Generic SFINAE helper for expression checks, based on the idea demonstrated in ISO C++ paper n4502 +template class... Checks> +struct supports_impl { + using type = std::false_type; +}; + +template class... Checks> +struct supports_impl...>, Checks...> { + using type = std::true_type; +}; + +template class... Checks> +using supports = typename supports_impl::type; + +//! A template to select either 32-bit or 64-bit constant as compile time, depending on machine word size. +template +struct select_size_t_constant { + // Explicit cast is needed to avoid compiler warnings about possible truncation. + // The value of the right size, which is selected by ?:, is anyway not truncated or promoted. + static const std::size_t value = static_cast((sizeof(std::size_t)==sizeof(u)) ? u : ull); +}; + +// TODO: do we really need it? +//! Cast between unrelated pointer types. +/** This method should be used sparingly as a last resort for dealing with + situations that inherently break strict ISO C++ aliasing rules. 
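+    For example (an illustrative sketch, not taken from these sources):
+        float f = 1.0f;
+        std::uint32_t* bits = punned_cast<std::uint32_t*>(&f);
+    reinterprets the storage of f through an unrelated pointer type.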
*/ +// T is a pointer type because it will be explicitly provided by the programmer as a template argument; +// U is a referent type to enable the compiler to check that "ptr" is a pointer, deducing U in the process. +template +inline T punned_cast( U* ptr ) { + std::uintptr_t x = reinterpret_cast(ptr); + return reinterpret_cast(x); +} + +template +struct padded_base : T { + char pad[S - R]; +}; +template struct padded_base : T {}; + +//! Pads type T to fill out to a multiple of cache line size. +template +struct padded : padded_base {}; + +#if __TBB_CPP14_INTEGER_SEQUENCE_PRESENT + +using std::index_sequence; +using std::make_index_sequence; + +#else + +template class index_sequence {}; + +template +struct make_index_sequence_impl : make_index_sequence_impl < N - 1, N - 1, S... > {}; + +template +struct make_index_sequence_impl <0, S...> { + using type = index_sequence; +}; + +template +using make_index_sequence = typename make_index_sequence_impl::type; + +#endif /* __TBB_CPP14_INTEGER_SEQUENCE_PRESENT */ + +#if __TBB_CPP17_LOGICAL_OPERATIONS_PRESENT +using std::conjunction; +using std::disjunction; +#else // __TBB_CPP17_LOGICAL_OPERATIONS_PRESENT + +template +struct conjunction : std::true_type {}; + +template +struct conjunction + : std::conditional, First>::type {}; + +template +struct conjunction : T {}; + +template +struct disjunction : std::false_type {}; + +template +struct disjunction + : std::conditional>::type {}; + +template +struct disjunction : T {}; + +#endif // __TBB_CPP17_LOGICAL_OPERATIONS_PRESENT + +template +using iterator_value_t = typename std::iterator_traits::value_type; + +template +using iterator_key_t = typename std::remove_const::first_type>::type; + +template +using iterator_mapped_t = typename iterator_value_t::second_type; + +template +using iterator_alloc_pair_t = std::pair>::type, + iterator_mapped_t>; + +template using alloc_value_type = typename A::value_type; +template using alloc_ptr_t = typename std::allocator_traits::pointer; +template using has_allocate = decltype(std::declval&>() = std::declval().allocate(0)); +template using has_deallocate = decltype(std::declval().deallocate(std::declval>(), 0)); + +// alloc_value_type should be checked first, because it can be used in other checks +template +using is_allocator = supports; + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT +template +inline constexpr bool is_allocator_v = is_allocator::value; +#endif + +// Template class in which the "type" determines the type of the element number N in pack Args +template +struct pack_element { + using type = void; +}; + +template +struct pack_element { + using type = typename pack_element::type; +}; + +template +struct pack_element<0, T, Args...> { + using type = T; +}; + +template +using pack_element_t = typename pack_element::type; + +template +class raii_guard { +public: + static_assert( + std::is_nothrow_copy_constructible::value && + std::is_nothrow_move_constructible::value, + "Throwing an exception during the Func copy or move construction cause an unexpected behavior." 
+ ); + + raii_guard( Func f ) noexcept : my_func(f), is_active(true) {} + + raii_guard( raii_guard&& g ) noexcept : my_func(std::move(g.my_func)), is_active(g.is_active) { + g.is_active = false; + } + + ~raii_guard() { + if (is_active) { + my_func(); + } + } + + void dismiss() { + is_active = false; + } +private: + Func my_func; + bool is_active; +}; // class raii_guard + +template +raii_guard make_raii_guard( Func f ) { + return raii_guard(f); +} + +template +struct try_call_proxy { + try_call_proxy( Body b ) : body(b) {} + + template + void on_exception( OnExceptionBody on_exception_body ) { + auto guard = make_raii_guard(on_exception_body); + body(); + guard.dismiss(); + } + + template + void on_completion(OnCompletionBody on_completion_body) { + auto guard = make_raii_guard(on_completion_body); + body(); + } + + Body body; +}; // struct try_call_proxy + +// Template helper function for API +// try_call(lambda1).on_exception(lambda2) +// Executes lambda1 and if it throws an exception - executes lambda2 +template +try_call_proxy try_call( Body b ) { + return try_call_proxy(b); +} + +#if __TBB_CPP17_IS_SWAPPABLE_PRESENT +using std::is_nothrow_swappable; +using std::is_swappable; +#else // __TBB_CPP17_IS_SWAPPABLE_PRESENT +namespace is_swappable_detail { +using std::swap; + +template +using has_swap = decltype(swap(std::declval(), std::declval())); + +#if _MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER +// Workaround for VS2015: it fails to instantiate noexcept(...) inside std::integral_constant. +template +struct noexcept_wrapper { + static const bool value = noexcept(swap(std::declval(), std::declval())); +}; +template +struct is_nothrow_swappable_impl : std::integral_constant::value> {}; +#else +template +struct is_nothrow_swappable_impl : std::integral_constant(), std::declval()))> {}; +#endif +} + +template +struct is_swappable : supports {}; + +template +struct is_nothrow_swappable + : conjunction, is_swappable_detail::is_nothrow_swappable_impl> {}; +#endif // __TBB_CPP17_IS_SWAPPABLE_PRESENT + +//! Allows to store a function parameter pack as a variable and later pass it to another function +template< typename... Types > +struct stored_pack; + +template<> +struct stored_pack<> +{ + using pack_type = stored_pack<>; + stored_pack() {} + + // Friend front-end functions + template< typename F, typename Pack > friend void call(F&& f, Pack&& p); + template< typename Ret, typename F, typename Pack > friend Ret call_and_return(F&& f, Pack&& p); + +protected: + // Ideally, ref-qualified non-static methods would be used, + // but that would greatly reduce the set of compilers where it works. + template< typename Ret, typename F, typename... Preceding > + static Ret call(F&& f, const pack_type& /*pack*/, Preceding&&... params) { + return std::forward(f)(std::forward(params)...); + } + template< typename Ret, typename F, typename... Preceding > + static Ret call(F&& f, pack_type&& /*pack*/, Preceding&&... params) { + return std::forward(f)(std::forward(params)...); + } +}; + +template< typename T, typename... Types > +struct stored_pack : stored_pack +{ + using pack_type = stored_pack; + using pack_remainder = stored_pack; + + // Since lifetime of original values is out of control, copies should be made. + // Thus references should be stripped away from the deduced type. + typename std::decay::type leftmost_value; + + // Here rvalue references act in the same way as forwarding references, + // as long as class template parameters were deduced via forwarding references. 
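+    // Illustrative use of the pack helpers declared further below (a sketch,
+    // not part of the original sources): the arguments are captured by value
+    // here and forwarded to a callable later.
+    //
+    //   auto fn = [](int i, const std::string& s) { suppress_unused_warning(i, s); };
+    //   call(fn, save_pack(42, std::string("payload")));  // invokes fn(42, "payload")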
+ stored_pack(T&& t, Types&&... types) + : pack_remainder(std::forward(types)...), leftmost_value(std::forward(t)) {} + + // Friend front-end functions + template< typename F, typename Pack > friend void call(F&& f, Pack&& p); + template< typename Ret, typename F, typename Pack > friend Ret call_and_return(F&& f, Pack&& p); + +protected: + template< typename Ret, typename F, typename... Preceding > + static Ret call(F&& f, pack_type& pack, Preceding&&... params) { + return pack_remainder::template call( + std::forward(f), static_cast(pack), + std::forward(params)... , pack.leftmost_value + ); + } + + template< typename Ret, typename F, typename... Preceding > + static Ret call(F&& f, pack_type&& pack, Preceding&&... params) { + return pack_remainder::template call( + std::forward(f), static_cast(pack), + std::forward(params)... , std::move(pack.leftmost_value) + ); + } +}; + +//! Calls the given function with arguments taken from a stored_pack +template< typename F, typename Pack > +void call(F&& f, Pack&& p) { + std::decay::type::template call(std::forward(f), std::forward(p)); +} + +template< typename Ret, typename F, typename Pack > +Ret call_and_return(F&& f, Pack&& p) { + return std::decay::type::template call(std::forward(f), std::forward(p)); +} + +template< typename... Types > +stored_pack save_pack(Types&&... types) { + return stored_pack(std::forward(types)...); +} + +// A structure with the value which is equal to Trait::value +// but can be used in the immediate context due to parameter T +template +struct dependent_bool : std::integral_constant {}; + +template +struct body_arg_detector; + +template +struct body_arg_detector { + using arg_type = Arg; +}; + +template +struct body_arg_detector { + using arg_type = Arg; +}; + +template +struct argument_detector; + +template +struct argument_detector { + using type = typename body_arg_detector::arg_type; +}; + +template +struct argument_detector { + using type = Arg; +}; + +// Detects the argument type of callable, works for callable with one argument. +template +using argument_type_of = typename argument_detector::type>::type; + +template +struct type_identity { + using type = T; +}; + +template +using type_identity_t = typename type_identity::type; + +} // inline namespace d0 +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail__template_helpers_H diff --git a/third_party/tbb/detail/_utils.h b/third_party/tbb/detail/_utils.h new file mode 100644 index 000000000..09fb02561 --- /dev/null +++ b/third_party/tbb/detail/_utils.h @@ -0,0 +1,394 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_detail__utils_H +#define __TBB_detail__utils_H + +#include "third_party/libcxx/type_traits" +#include "third_party/libcxx/cstdint" +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/functional" + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_machine.h" + +namespace tbb { +namespace detail { +inline namespace d0 { + +//! Utility template function to prevent "unused" warnings by various compilers. +template void suppress_unused_warning(T&&...) {} + +//! Compile-time constant that is upper bound on cache line/sector size. +/** It should be used only in situations where having a compile-time upper + bound is more useful than a run-time exact answer. + @ingroup memory_allocation */ +constexpr size_t max_nfs_size = 128; +constexpr std::size_t max_nfs_size_exp = 7; +static_assert(1 << max_nfs_size_exp == max_nfs_size, "max_nfs_size_exp must be a log2(max_nfs_size)"); + +//! Class that implements exponential backoff. +class atomic_backoff { + //! Time delay, in units of "pause" instructions. + /** Should be equal to approximately the number of "pause" instructions + that take the same time as an context switch. Must be a power of two.*/ + static constexpr std::int32_t LOOPS_BEFORE_YIELD = 16; + std::int32_t count; + +public: + // In many cases, an object of this type is initialized eagerly on hot path, + // as in for(atomic_backoff b; ; b.pause()) { /*loop body*/ } + // For this reason, the construction cost must be very small! + atomic_backoff() : count(1) {} + // This constructor pauses immediately; do not use on hot paths! + atomic_backoff(bool) : count(1) { pause(); } + + //! No Copy + atomic_backoff(const atomic_backoff&) = delete; + atomic_backoff& operator=(const atomic_backoff&) = delete; + + //! Pause for a while. + void pause() { + if (count <= LOOPS_BEFORE_YIELD) { + machine_pause(count); + // Pause twice as long the next time. + count *= 2; + } else { + // Pause is so long that we might as well yield CPU to scheduler. + yield(); + } + } + + //! Pause for a few times and return false if saturated. + bool bounded_pause() { + machine_pause(count); + if (count < LOOPS_BEFORE_YIELD) { + // Pause twice as long the next time. + count *= 2; + return true; + } else { + return false; + } + } + + void reset() { + count = 1; + } +}; + +//! Spin WHILE the condition is true. +/** T and U should be comparable types. */ +template +T spin_wait_while(const std::atomic& location, C comp, std::memory_order order) { + atomic_backoff backoff; + T snapshot = location.load(order); + while (comp(snapshot)) { + backoff.pause(); + snapshot = location.load(order); + } + return snapshot; +} + +//! Spin WHILE the value of the variable is equal to a given value +/** T and U should be comparable types. */ +template +T spin_wait_while_eq(const std::atomic& location, const U value, std::memory_order order = std::memory_order_acquire) { + return spin_wait_while(location, [&value](T t) { return t == value; }, order); +} + +//! Spin UNTIL the value of the variable is equal to a given value +/** T and U should be comparable types. */ +template +T spin_wait_until_eq(const std::atomic& location, const U value, std::memory_order order = std::memory_order_acquire) { + return spin_wait_while(location, [&value](T t) { return t != value; }, order); +} + +//! Spin UNTIL the condition returns true or spinning time is up. +/** Returns what the passed functor returned last time it was invoked. 
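+    For example (illustrative), a caller such as
+        if (!timed_spin_wait_until([&] { return flag.load(std::memory_order_acquire); })) {
+            // the spin budget ran out; fall back to a blocking wait
+        }
+    tries the short spin phase first and takes the slow path only if it fails.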
*/ +template +bool timed_spin_wait_until(Condition condition) { + // 32 pauses + 32 yields are meausered as balanced spin time before sleep. + bool finish = condition(); + for (int i = 1; !finish && i < 32; finish = condition(), i *= 2) { + machine_pause(i); + } + for (int i = 32; !finish && i < 64; finish = condition(), ++i) { + yield(); + } + return finish; +} + +template +T clamp(T value, T lower_bound, T upper_bound) { + __TBB_ASSERT(lower_bound <= upper_bound, "Incorrect bounds"); + return value > lower_bound ? (value > upper_bound ? upper_bound : value) : lower_bound; +} + +template +std::uintptr_t log2(T in) { + __TBB_ASSERT(in > 0, "The logarithm of a non-positive value is undefined."); + return machine_log2(in); +} + +template +T reverse_bits(T src) { + return machine_reverse_bits(src); +} + +template +T reverse_n_bits(T src, std::size_t n) { + __TBB_ASSERT(n != 0, "Reverse for 0 bits is undefined behavior."); + return reverse_bits(src) >> (number_of_bits() - n); +} + +// A function to check if passed integer is a power of two +template +constexpr bool is_power_of_two( IntegerType arg ) { + static_assert(std::is_integral::value, + "An argument for is_power_of_two should be integral type"); + return arg && (0 == (arg & (arg - 1))); +} + +// A function to determine if passed integer is a power of two +// at least as big as another power of two, i.e. for strictly positive i and j, +// with j being a power of two, determines whether i==j< +constexpr bool is_power_of_two_at_least(ArgIntegerType arg, DivisorIntegerType divisor) { + // Divisor should be a power of two + static_assert(std::is_integral::value, + "An argument for is_power_of_two_at_least should be integral type"); + return 0 == (arg & (arg - divisor)); +} + +// A function to compute arg modulo divisor where divisor is a power of 2. +template +inline ArgIntegerType modulo_power_of_two(ArgIntegerType arg, DivisorIntegerType divisor) { + __TBB_ASSERT( is_power_of_two(divisor), "Divisor should be a power of two" ); + return arg & (divisor - 1); +} + +//! A function to check if passed in pointer is aligned on a specific border +template +constexpr bool is_aligned(T* pointer, std::uintptr_t alignment) { + return 0 == (reinterpret_cast(pointer) & (alignment - 1)); +} + +#if TBB_USE_ASSERT +static void* const poisoned_ptr = reinterpret_cast(-1); + +//! Set p to invalid pointer value. +template +inline void poison_pointer( T* &p ) { p = reinterpret_cast(poisoned_ptr); } + +template +inline void poison_pointer(std::atomic& p) { p.store(reinterpret_cast(poisoned_ptr), std::memory_order_relaxed); } + +/** Expected to be used in assertions only, thus no empty form is defined. **/ +template +inline bool is_poisoned( T* p ) { return p == reinterpret_cast(poisoned_ptr); } + +template +inline bool is_poisoned(const std::atomic& p) { return is_poisoned(p.load(std::memory_order_relaxed)); } +#else +template +inline void poison_pointer(T&) {/*do nothing*/} +#endif /* !TBB_USE_ASSERT */ + +template +bool assert_pointer_valid(T* p, const char* comment = nullptr) { + suppress_unused_warning(p, comment); + __TBB_ASSERT(p != nullptr, comment); + __TBB_ASSERT(!is_poisoned(p), comment); +#if !(_MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER) + __TBB_ASSERT(is_aligned(p, alignment == 0 ? alignof(T) : alignment), comment); +#endif + // Returns something to simplify assert_pointers_valid implementation. + return true; +} + +template +void assert_pointers_valid(Args*... 
p) { + // suppress_unused_warning is used as an evaluation context for the variadic pack. + suppress_unused_warning(assert_pointer_valid(p)...); +} + +//! Base class for types that should not be assigned. +class no_assign { +public: + void operator=(const no_assign&) = delete; + no_assign(const no_assign&) = default; + no_assign() = default; +}; + +//! Base class for types that should not be copied or assigned. +class no_copy: no_assign { +public: + no_copy(const no_copy&) = delete; + no_copy() = default; +}; + +template +void swap_atomics_relaxed(std::atomic& lhs, std::atomic& rhs){ + T tmp = lhs.load(std::memory_order_relaxed); + lhs.store(rhs.load(std::memory_order_relaxed), std::memory_order_relaxed); + rhs.store(tmp, std::memory_order_relaxed); +} + +//! One-time initialization states +enum class do_once_state { + uninitialized = 0, ///< No execution attempts have been undertaken yet + pending, ///< A thread is executing associated do-once routine + executed, ///< Do-once routine has been executed + initialized = executed ///< Convenience alias +}; + +//! One-time initialization function +/** /param initializer Pointer to function without arguments + The variant that returns bool is used for cases when initialization can fail + and it is OK to continue execution, but the state should be reset so that + the initialization attempt was repeated the next time. + /param state Shared state associated with initializer that specifies its + initialization state. Must be initially set to #uninitialized value + (e.g. by means of default static zero initialization). **/ +template +void atomic_do_once( const F& initializer, std::atomic& state ) { + // The loop in the implementation is necessary to avoid race when thread T2 + // that arrived in the middle of initialization attempt by another thread T1 + // has just made initialization possible. + // In such a case T2 has to rely on T1 to initialize, but T1 may already be past + // the point where it can recognize the changed conditions. 
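+    // Typical call site (a sketch; g_state and init_globals are illustrative
+    // names, not part of these sources):
+    //
+    //   static std::atomic<do_once_state> g_state;  // zero-initialized => uninitialized
+    //   atomic_do_once([] { init_globals(); }, g_state);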
+ do_once_state expected_state; + while ( state.load( std::memory_order_acquire ) != do_once_state::executed ) { + if( state.load( std::memory_order_relaxed ) == do_once_state::uninitialized ) { + expected_state = do_once_state::uninitialized; +#if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1910 + using enum_type = typename std::underlying_type::type; + if( ((std::atomic&)state).compare_exchange_strong( (enum_type&)expected_state, (enum_type)do_once_state::pending ) ) { +#else + if( state.compare_exchange_strong( expected_state, do_once_state::pending ) ) { +#endif + run_initializer( initializer, state ); + break; + } + } + spin_wait_while_eq( state, do_once_state::pending ); + } +} + +// Run the initializer which can not fail +template +void run_initializer(const Functor& f, std::atomic& state ) { + f(); + state.store(do_once_state::executed, std::memory_order_release); +} + +#if __TBB_CPP20_CONCEPTS_PRESENT +template +concept boolean_testable_impl = std::convertible_to; + +template +concept boolean_testable = boolean_testable_impl && requires( T&& t ) { + { !std::forward(t) } -> boolean_testable_impl; + }; + +#if __TBB_CPP20_COMPARISONS_PRESENT +struct synthesized_three_way_comparator { + template + auto operator()( const T1& lhs, const T2& rhs ) const + requires requires { + { lhs < rhs } -> boolean_testable; + { rhs < lhs } -> boolean_testable; + } + { + if constexpr (std::three_way_comparable_with) { + return lhs <=> rhs; + } else { + if (lhs < rhs) { + return std::weak_ordering::less; + } + if (rhs < lhs) { + return std::weak_ordering::greater; + } + return std::weak_ordering::equivalent; + } + } +}; // struct synthesized_three_way_comparator + +template +using synthesized_three_way_result = decltype(synthesized_three_way_comparator{}(std::declval(), + std::declval())); + +#endif // __TBB_CPP20_COMPARISONS_PRESENT + +// Check if the type T is implicitly OR explicitly convertible to U +template +concept relaxed_convertible_to = std::constructible_from; + +template +concept adaptive_same_as = +#if __TBB_STRICT_CONSTRAINTS + std::same_as; +#else + std::convertible_to; +#endif +#endif // __TBB_CPP20_CONCEPTS_PRESENT + +template +auto invoke(F&& f, Args&&... args) +#if __TBB_CPP17_INVOKE_PRESENT + noexcept(std::is_nothrow_invocable_v) + -> std::invoke_result_t +{ + return std::invoke(std::forward(f), std::forward(args)...); +} +#else // __TBB_CPP17_INVOKE_PRESENT + noexcept(noexcept(std::forward(f)(std::forward(args)...))) + -> decltype(std::forward(f)(std::forward(args)...)) +{ + return std::forward(f)(std::forward(args)...); +} +#endif // __TBB_CPP17_INVOKE_PRESENT + +} // namespace d0 + +namespace d1 { + +class delegate_base { +public: + virtual bool operator()() const = 0; + virtual ~delegate_base() {} +}; + +template +class delegated_function : public delegate_base { +public: + delegated_function(FuncType& f) : my_func(f) {} + + bool operator()() const override { + return my_func(); + } + +private: + FuncType &my_func; +}; +} // namespace d1 + +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail__utils_H diff --git a/third_party/tbb/detail/_waitable_atomic.h b/third_party/tbb/detail/_waitable_atomic.h new file mode 100644 index 000000000..992f9a112 --- /dev/null +++ b/third_party/tbb/detail/_waitable_atomic.h @@ -0,0 +1,105 @@ +// clang-format off +/* + Copyright (c) 2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__address_waiters_H +#define __TBB_detail__address_waiters_H + +#include "third_party/tbb/detail/_utils.h" + +namespace tbb { +namespace detail { + +namespace r1 { +TBB_EXPORT void __TBB_EXPORTED_FUNC wait_on_address(void* address, d1::delegate_base& wakeup_condition, std::uintptr_t context); +TBB_EXPORT void __TBB_EXPORTED_FUNC notify_by_address(void* address, std::uintptr_t context); +TBB_EXPORT void __TBB_EXPORTED_FUNC notify_by_address_one(void* address); +TBB_EXPORT void __TBB_EXPORTED_FUNC notify_by_address_all(void* address); +} // namespace r1 + +namespace d1 { + +template +void adaptive_wait_on_address(void* address, Predicate wakeup_condition, std::uintptr_t context) { + if (!timed_spin_wait_until(wakeup_condition)) { + d1::delegated_function pred(wakeup_condition); + r1::wait_on_address(address, pred, context); + } +} + +template +class waitable_atomic { +public: + waitable_atomic() = default; + + explicit waitable_atomic(T value) : my_atomic(value) {} + + waitable_atomic(const waitable_atomic&) = delete; + waitable_atomic& operator=(const waitable_atomic&) = delete; + + T load(std::memory_order order) const noexcept { + return my_atomic.load(order); + } + + T exchange(T desired) noexcept { + return my_atomic.exchange(desired); + } + + void wait(T old, std::uintptr_t context, std::memory_order order) { + auto wakeup_condition = [&] { return my_atomic.load(order) != old; }; + if (!timed_spin_wait_until(wakeup_condition)) { + // We need to use while here, because notify_all() will wake up all threads + // But predicate for them might be false + d1::delegated_function pred(wakeup_condition); + do { + r1::wait_on_address(this, pred, context); + } while (!wakeup_condition()); + } + } + + void wait_until(T expected, std::uintptr_t context, std::memory_order order) { + auto wakeup_condition = [&] { return my_atomic.load(order) == expected; }; + if (!timed_spin_wait_until(wakeup_condition)) { + // We need to use while here, because notify_all() will wake up all threads + // But predicate for them might be false + d1::delegated_function pred(wakeup_condition); + do { + r1::wait_on_address(this, pred, context); + } while (!wakeup_condition()); + } + } + + void notify_relaxed(std::uintptr_t context) { + r1::notify_by_address(this, context); + } + + void notify_one_relaxed() { + r1::notify_by_address_one(this); + } + + // TODO: consider adding following interfaces: + // store(desired, memory_order) + // notify_all_relaxed() + +private: + std::atomic my_atomic{}; +}; + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail__address_waiters_H diff --git a/third_party/tbb/dynamic_link.cpp b/third_party/tbb/dynamic_link.cpp new file mode 100644 index 000000000..c20e88f2c --- /dev/null +++ b/third_party/tbb/dynamic_link.cpp @@ -0,0 +1,516 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/dynamic_link.h" +#include "third_party/tbb/environment.h" + +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_utils.h" + +/* + This file is used by both TBB and OpenMP RTL. Do not use __TBB_ASSERT() macro + and runtime_warning() function because they are not available in OpenMP. Use + __TBB_ASSERT_EX and DYNAMIC_LINK_WARNING instead. +*/ + +#include "third_party/libcxx/cstdarg" // va_list etc. +#include "third_party/libcxx/cstring" // strrchr +#if _WIN32 + #include "libc/mem/mem.h" + + // Unify system calls + #define dlopen( name, flags ) LoadLibrary( name ) + #define dlsym( handle, name ) GetProcAddress( handle, name ) + #define dlclose( handle ) ( ! FreeLibrary( handle ) ) + #define dlerror() GetLastError() +#ifndef PATH_MAX + #define PATH_MAX MAX_PATH +#endif +#else /* _WIN32 */ + #include "libc/runtime/dlfcn.h" + #include "libc/calls/calls.h" +#include "libc/calls/weirdtypes.h" +#include "libc/runtime/pathconf.h" +#include "libc/runtime/runtime.h" +#include "libc/runtime/sysconf.h" +#include "libc/sysv/consts/f.h" +#include "libc/sysv/consts/fileno.h" +#include "libc/sysv/consts/o.h" +#include "libc/sysv/consts/ok.h" +#include "libc/time/time.h" +#include "third_party/getopt/getopt.h" +#include "third_party/musl/crypt.h" +#include "third_party/musl/lockf.h" + + #include "third_party/libcxx/climits" + #include "third_party/libcxx/cstdlib" +#endif /* _WIN32 */ + +#if __TBB_WEAK_SYMBOLS_PRESENT && !__TBB_DYNAMIC_LOAD_ENABLED + //TODO: use function attribute for weak symbols instead of the pragma. + #pragma weak dlopen + #pragma weak dlsym + #pragma weak dlclose +#endif /* __TBB_WEAK_SYMBOLS_PRESENT && !__TBB_DYNAMIC_LOAD_ENABLED */ + + +#define __USE_STATIC_DL_INIT ( !__ANDROID__ ) + + +/* +dynamic_link is a common interface for searching for required symbols in an +executable and dynamic libraries. + +dynamic_link provides certain guarantees: + 1. Either all or none of the requested symbols are resolved. Moreover, if + symbols are not resolved, the dynamic_link_descriptor table is not modified; + 2. All returned symbols have secured lifetime: this means that none of them + can be invalidated until dynamic_unlink is called; + 3. Any loaded library is loaded only via the full path. The full path is that + from which the runtime itself was loaded. (This is done to avoid security + issues caused by loading libraries from insecure paths). + +dynamic_link searches for the requested symbols in three stages, stopping as +soon as all of the symbols have been resolved. + + 1. Search the global scope: + a. On Windows: dynamic_link tries to obtain the handle of the requested + library and if it succeeds it resolves the symbols via that handle. + b. On Linux: dynamic_link tries to search for the symbols in the global + scope via the main program handle. If the symbols are present in the global + scope their lifetime is not guaranteed (since dynamic_link does not know + anything about the library from which they are exported). Therefore it + tries to "pin" the symbols by obtaining the library name and reopening it. 
+ dlopen may fail to reopen the library in two cases: + i. The symbols are exported from the executable. Currently dynamic _link + cannot handle this situation, so it will not find these symbols in this + step. + ii. The necessary library has been unloaded and cannot be reloaded. It + seems there is nothing that can be done in this case. No symbols are + returned. + + 2. Dynamic load: an attempt is made to load the requested library via the + full path. + The full path used is that from which the runtime itself was loaded. If the + library can be loaded, then an attempt is made to resolve the requested + symbols in the newly loaded library. + If the symbols are not found the library is unloaded. + + 3. Weak symbols: if weak symbols are available they are returned. +*/ + +namespace tbb { +namespace detail { +namespace r1 { + +#if __TBB_WEAK_SYMBOLS_PRESENT || __TBB_DYNAMIC_LOAD_ENABLED + +#if !defined(DYNAMIC_LINK_WARNING) && !__TBB_WIN8UI_SUPPORT && __TBB_DYNAMIC_LOAD_ENABLED + // Report runtime errors and continue. + #define DYNAMIC_LINK_WARNING dynamic_link_warning + static void dynamic_link_warning( dynamic_link_error_t code, ... ) { + suppress_unused_warning(code); + } // library_warning +#endif /* !defined(DYNAMIC_LINK_WARNING) && !__TBB_WIN8UI_SUPPORT && __TBB_DYNAMIC_LOAD_ENABLED */ + + static bool resolve_symbols( dynamic_link_handle module, const dynamic_link_descriptor descriptors[], std::size_t required ) + { + if ( !module ) + return false; + + #if !__TBB_DYNAMIC_LOAD_ENABLED /* only __TBB_WEAK_SYMBOLS_PRESENT is defined */ + if ( !dlsym ) return false; + #endif /* !__TBB_DYNAMIC_LOAD_ENABLED */ + + const std::size_t n_desc=20; // Usually we don't have more than 20 descriptors per library + __TBB_ASSERT_EX( required <= n_desc, "Too many descriptors is required" ); + if ( required > n_desc ) return false; + pointer_to_handler h[n_desc]; + + for ( std::size_t k = 0; k < required; ++k ) { + dynamic_link_descriptor const & desc = descriptors[k]; + pointer_to_handler addr = (pointer_to_handler)dlsym( module, desc.name ); + if ( !addr ) { + return false; + } + h[k] = addr; + } + + // Commit the entry points. + // Cannot use memset here, because the writes must be atomic. + for( std::size_t k = 0; k < required; ++k ) + *descriptors[k].handler = h[k]; + return true; + } + +#if __TBB_WIN8UI_SUPPORT + bool dynamic_link( const char* library, const dynamic_link_descriptor descriptors[], std::size_t required, dynamic_link_handle*, int flags ) { + dynamic_link_handle tmp_handle = nullptr; + TCHAR wlibrary[256]; + if ( MultiByteToWideChar(CP_UTF8, 0, library, -1, wlibrary, 255) == 0 ) return false; + if ( flags & DYNAMIC_LINK_LOAD ) + tmp_handle = LoadPackagedLibrary( wlibrary, 0 ); + if (tmp_handle != nullptr){ + return resolve_symbols(tmp_handle, descriptors, required); + }else{ + return false; + } + } + void dynamic_unlink( dynamic_link_handle ) {} + void dynamic_unlink_all() {} +#else +#if __TBB_DYNAMIC_LOAD_ENABLED +/* + There is a security issue on Windows: LoadLibrary() may load and execute malicious code. + See http://www.microsoft.com/technet/security/advisory/2269637.mspx for details. + To avoid the issue, we have to pass full path (not just library name) to LoadLibrary. This + function constructs full path to the specified library (it is assumed the library located + side-by-side with the tbb.dll. + + The function constructs absolute path for given relative path. Important: Base directory is not + current one, it is the directory tbb.dll loaded from. 
+ + Example: + Let us assume "tbb.dll" is located in "c:\program files\common\intel\" directory, e.g. + absolute path of the library is "c:\program files\common\intel\tbb.dll". Absolute path for + "tbbmalloc.dll" would be "c:\program files\common\intel\tbbmalloc.dll". Absolute path for + "malloc\tbbmalloc.dll" would be "c:\program files\common\intel\malloc\tbbmalloc.dll". +*/ + + // Struct handle_storage is used by dynamic_link routine to store handles of + // all loaded or pinned dynamic libraries. When TBB is shut down, it calls + // dynamic_unlink_all() that unloads modules referenced by handle_storage. + // This struct should not have any constructors since it may be used before + // the constructor is called. + #define MAX_LOADED_MODULES 8 // The number of maximum possible modules which can be loaded + + using atomic_incrementer = std::atomic; + + static struct handles_t { + atomic_incrementer my_size; + dynamic_link_handle my_handles[MAX_LOADED_MODULES]; + + void add(const dynamic_link_handle &handle) { + const std::size_t ind = my_size++; + __TBB_ASSERT_EX( ind < MAX_LOADED_MODULES, "Too many modules are loaded" ); + my_handles[ind] = handle; + } + + void free() { + const std::size_t size = my_size; + for (std::size_t i=0; i( PATH_MAX ) ); + if ( drc == 0 ) { // Error occurred. + int err = GetLastError(); + DYNAMIC_LINK_WARNING( dl_sys_fail, "GetModuleFileName", err ); + return; + } + if ( drc >= PATH_MAX ) { // Buffer too short. + DYNAMIC_LINK_WARNING( dl_buff_too_small ); + return; + } + // Find the position of the last backslash. + char *backslash = std::strrchr( ap_data._path, '\\' ); + + if ( !backslash ) { // Backslash not found. + __TBB_ASSERT_EX( backslash != nullptr, "Unbelievable."); + return; + } + __TBB_ASSERT_EX( backslash >= ap_data._path, "Unbelievable."); + ap_data._len = (std::size_t)(backslash - ap_data._path) + 1; + *(backslash+1) = 0; + #else + // Get the library path + Dl_info dlinfo; + int res = dladdr( (void*)&dynamic_link, &dlinfo ); // any function inside the library can be used for the address + if ( !res ) { + char const * err = dlerror(); + DYNAMIC_LINK_WARNING( dl_sys_fail, "dladdr", err ); + return; + } else { + __TBB_ASSERT_EX( dlinfo.dli_fname!=nullptr, "Unbelievable." ); + } + + char const *slash = std::strrchr( dlinfo.dli_fname, '/' ); + std::size_t fname_len=0; + if ( slash ) { + __TBB_ASSERT_EX( slash >= dlinfo.dli_fname, "Unbelievable."); + fname_len = (std::size_t)(slash - dlinfo.dli_fname) + 1; + } + + std::size_t rc; + if ( dlinfo.dli_fname[0]=='/' ) { + // The library path is absolute + rc = 0; + ap_data._len = 0; + } else { + // The library path is relative so get the current working directory + if ( !getcwd( ap_data._path, sizeof(ap_data._path)/sizeof(ap_data._path[0]) ) ) { + DYNAMIC_LINK_WARNING( dl_buff_too_small ); + return; + } + ap_data._len = std::strlen( ap_data._path ); + ap_data._path[ap_data._len++]='/'; + rc = ap_data._len; + } + + if ( fname_len>0 ) { + ap_data._len += fname_len; + if ( ap_data._len>PATH_MAX ) { + DYNAMIC_LINK_WARNING( dl_buff_too_small ); + ap_data._len=0; + return; + } + std::strncpy( ap_data._path+rc, dlinfo.dli_fname, fname_len ); + ap_data._path[ap_data._len]=0; + } + #endif /* _WIN32 */ + } + + static void init_dl_data() { + init_ap_data(); + } + + /* + The function constructs absolute path for given relative path. Important: Base directory is not + current one, it is the directory libtbb.so loaded from. 
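+    For example (illustrative): if libtbb.so was loaded from /opt/intel/lib/, then
+    abs_path( "libtbbmalloc.so.2", buf, len ) writes "/opt/intel/lib/libtbbmalloc.so.2"
+    into buf, provided len is large enough.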
+ + Arguments: + in name -- Name of a file (may be with relative path; it must not be an absolute one). + out path -- Buffer to save result (absolute path) to. + in len -- Size of buffer. + ret -- 0 -- Error occurred. + > len -- Buffer too short, required size returned. + otherwise -- Ok, number of characters (incl. terminating null) written to buffer. + */ + static std::size_t abs_path( char const * name, char * path, std::size_t len ) { + if ( ap_data._len == 0 ) + return 0; + + std::size_t name_len = std::strlen( name ); + std::size_t full_len = name_len+ap_data._len; + if ( full_len < len ) { + __TBB_ASSERT( ap_data._path[ap_data._len] == 0, nullptr); + __TBB_ASSERT( std::strlen(ap_data._path) == ap_data._len, nullptr); + std::strncpy( path, ap_data._path, ap_data._len + 1 ); + __TBB_ASSERT( path[ap_data._len] == 0, nullptr); + std::strncat( path, name, len - ap_data._len ); + __TBB_ASSERT( std::strlen(path) == full_len, nullptr); + } + return full_len+1; // +1 for null character + } +#endif // __TBB_DYNAMIC_LOAD_ENABLED + void init_dynamic_link_data() { + #if __TBB_DYNAMIC_LOAD_ENABLED + std::call_once( init_dl_data_state, init_dl_data ); + #endif + } + + #if __USE_STATIC_DL_INIT + // ap_data structure is initialized with current directory on Linux. + // So it should be initialized as soon as possible since the current directory may be changed. + // static_init_ap_data object provides this initialization during library loading. + static struct static_init_dl_data_t { + static_init_dl_data_t() { + init_dynamic_link_data(); + } + } static_init_dl_data; + #endif + + #if __TBB_WEAK_SYMBOLS_PRESENT + static bool weak_symbol_link( const dynamic_link_descriptor descriptors[], std::size_t required ) + { + // Check if the required entries are present in what was loaded into our process. + for ( std::size_t k = 0; k < required; ++k ) + if ( !descriptors[k].ptr ) + return false; + // Commit the entry points. 
+ for ( std::size_t k = 0; k < required; ++k ) + *descriptors[k].handler = (pointer_to_handler) descriptors[k].ptr; + return true; + } + #else + static bool weak_symbol_link( const dynamic_link_descriptor[], std::size_t ) { + return false; + } + #endif /* __TBB_WEAK_SYMBOLS_PRESENT */ + + void dynamic_unlink( dynamic_link_handle handle ) { + #if !__TBB_DYNAMIC_LOAD_ENABLED /* only __TBB_WEAK_SYMBOLS_PRESENT is defined */ + if ( !dlclose ) return; + #endif + if ( handle ) { + dlclose( handle ); + } + } + + void dynamic_unlink_all() { + #if __TBB_DYNAMIC_LOAD_ENABLED + handles.free(); + #endif + } + + static dynamic_link_handle global_symbols_link( const char* library, const dynamic_link_descriptor descriptors[], std::size_t required ) { + dynamic_link_handle library_handle{}; +#if _WIN32 + auto res = GetModuleHandleEx(0, library, &library_handle); + __TBB_ASSERT_EX((res && library_handle) || (!res && !library_handle), nullptr); +#else /* _WIN32 */ + #if !__TBB_DYNAMIC_LOAD_ENABLED /* only __TBB_WEAK_SYMBOLS_PRESENT is defined */ + if ( !dlopen ) return 0; + #endif /* !__TBB_DYNAMIC_LOAD_ENABLED */ + // RTLD_GLOBAL - to guarantee that old TBB will find the loaded library + // RTLD_NOLOAD - not to load the library without the full path + library_handle = dlopen(library, RTLD_LAZY | RTLD_GLOBAL | RTLD_NOLOAD); +#endif /* _WIN32 */ + if (library_handle) { + if (!resolve_symbols(library_handle, descriptors, required)) { + dynamic_unlink(library_handle); + library_handle = nullptr; + } + } + return library_handle; + } + + static void save_library_handle( dynamic_link_handle src, dynamic_link_handle *dst ) { + __TBB_ASSERT_EX( src, "The library handle to store must be non-zero" ); + if ( dst ) + *dst = src; + #if __TBB_DYNAMIC_LOAD_ENABLED + else + handles.add( src ); + #endif /* __TBB_DYNAMIC_LOAD_ENABLED */ + } + +#if !_WIN32 + int loading_flags(bool local_binding) { + int flags = RTLD_NOW; + if (local_binding) { + flags = flags | RTLD_LOCAL; +#if (__linux__ && __GLIBC__) && !__TBB_USE_SANITIZERS + if( !GetBoolEnvironmentVariable("TBB_ENABLE_SANITIZERS") ) { + flags = flags | RTLD_DEEPBIND; + } +#endif + } else { + flags = flags | RTLD_GLOBAL; + } + return flags; + } +#endif + + dynamic_link_handle dynamic_load( const char* library, const dynamic_link_descriptor descriptors[], std::size_t required, bool local_binding ) { + ::tbb::detail::suppress_unused_warning( library, descriptors, required, local_binding ); +#if __TBB_DYNAMIC_LOAD_ENABLED + std::size_t const len = PATH_MAX + 1; + char path[ len ]; + std::size_t rc = abs_path( library, path, len ); + if ( 0 < rc && rc <= len ) { +#if _WIN32 + // Prevent Windows from displaying silly message boxes if it fails to load library + // (e.g. 
because of MS runtime problems - one of those crazy manifest related ones) + UINT prev_mode = SetErrorMode (SEM_FAILCRITICALERRORS); +#endif /* _WIN32 */ + // The second argument (loading_flags) is ignored on Windows + dynamic_link_handle library_handle = dlopen( path, loading_flags(local_binding) ); +#if _WIN32 + SetErrorMode (prev_mode); +#endif /* _WIN32 */ + if( library_handle ) { + if( !resolve_symbols( library_handle, descriptors, required ) ) { + // The loaded library does not contain all the expected entry points + dynamic_unlink( library_handle ); + library_handle = nullptr; + } + } else + DYNAMIC_LINK_WARNING( dl_lib_not_found, path, dlerror() ); + return library_handle; + } else if ( rc>len ) + DYNAMIC_LINK_WARNING( dl_buff_too_small ); + // rc == 0 means failing of init_ap_data so the warning has already been issued. + +#endif /* __TBB_DYNAMIC_LOAD_ENABLED */ + return nullptr; + } + + bool dynamic_link( const char* library, const dynamic_link_descriptor descriptors[], std::size_t required, dynamic_link_handle *handle, int flags ) { + init_dynamic_link_data(); + + // TODO: May global_symbols_link find weak symbols? + dynamic_link_handle library_handle = ( flags & DYNAMIC_LINK_GLOBAL ) ? global_symbols_link( library, descriptors, required ) : nullptr; + +#if defined(_MSC_VER) && _MSC_VER <= 1900 +#pragma warning (push) +// MSVC 2015 warning: 'int': forcing value to bool 'true' or 'false' +#pragma warning (disable: 4800) +#endif + if ( !library_handle && ( flags & DYNAMIC_LINK_LOAD ) ) + library_handle = dynamic_load( library, descriptors, required, flags & DYNAMIC_LINK_LOCAL ); + +#if defined(_MSC_VER) && _MSC_VER <= 1900 +#pragma warning (pop) +#endif + if ( !library_handle && ( flags & DYNAMIC_LINK_WEAK ) ) + return weak_symbol_link( descriptors, required ); + + if ( library_handle ) { + save_library_handle( library_handle, handle ); + return true; + } + return false; + } + +#endif /*__TBB_WIN8UI_SUPPORT*/ +#else /* __TBB_WEAK_SYMBOLS_PRESENT || __TBB_DYNAMIC_LOAD_ENABLED */ + bool dynamic_link( const char*, const dynamic_link_descriptor*, std::size_t, dynamic_link_handle *handle, int ) { + if ( handle ) + *handle=0; + return false; + } + void dynamic_unlink( dynamic_link_handle ) {} + void dynamic_unlink_all() {} +#endif /* __TBB_WEAK_SYMBOLS_PRESENT || __TBB_DYNAMIC_LOAD_ENABLED */ + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/dynamic_link.h b/third_party/tbb/dynamic_link.h new file mode 100644 index 000000000..a7af0072c --- /dev/null +++ b/third_party/tbb/dynamic_link.h @@ -0,0 +1,137 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_dynamic_link +#define __TBB_dynamic_link + +// Support for dynamic loading entry points from other shared libraries. + +#include "third_party/tbb/detail/_config.h" + +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/mutex" + +/** By default, symbols declared and defined here go into namespace tbb::internal. 
+ To put them in other namespace, define macros OPEN_INTERNAL_NAMESPACE + and CLOSE_INTERNAL_NAMESPACE to override the following default definitions. **/ + +#include "third_party/libcxx/cstddef" +#ifdef _WIN32 +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" +#endif /* _WIN32 */ + +namespace tbb { +namespace detail { +namespace r1 { + +//! Type definition for a pointer to a void somefunc(void) +typedef void (*pointer_to_handler)(); + +//! The helper to construct dynamic_link_descriptor structure +// Double cast through the void* in DLD macro is necessary to +// prevent warnings from some compilers (g++ 4.1) +#if __TBB_WEAK_SYMBOLS_PRESENT +#define DLD(s,h) {#s, (pointer_to_handler*)(void*)(&h), (pointer_to_handler)&s} +#define DLD_NOWEAK(s,h) {#s, (pointer_to_handler*)(void*)(&h), nullptr} +#else +#define DLD(s,h) {#s, (pointer_to_handler*)(void*)(&h)} +#define DLD_NOWEAK(s,h) DLD(s,h) +#endif /* __TBB_WEAK_SYMBOLS_PRESENT */ +//! Association between a handler name and location of pointer to it. +struct dynamic_link_descriptor { + //! Name of the handler + const char* name; + //! Pointer to the handler + pointer_to_handler* handler; +#if __TBB_WEAK_SYMBOLS_PRESENT + //! Weak symbol + pointer_to_handler ptr; +#endif +}; + +#if _WIN32 +using dynamic_link_handle = HMODULE; +#else +using dynamic_link_handle = void*; +#endif /* _WIN32 */ + +const int DYNAMIC_LINK_GLOBAL = 0x01; +const int DYNAMIC_LINK_LOAD = 0x02; +const int DYNAMIC_LINK_WEAK = 0x04; +const int DYNAMIC_LINK_LOCAL = 0x08; + +const int DYNAMIC_LINK_LOCAL_BINDING = DYNAMIC_LINK_LOCAL | DYNAMIC_LINK_LOAD; +const int DYNAMIC_LINK_DEFAULT = DYNAMIC_LINK_GLOBAL | DYNAMIC_LINK_LOAD | DYNAMIC_LINK_WEAK; + +//! Fill in dynamically linked handlers. +/** 'library' is the name of the requested library. It should not contain a full + path since dynamic_link adds the full path (from which the runtime itself + was loaded) to the library name. + 'required' is the number of the initial entries in the array descriptors[] + that have to be found in order for the call to succeed. If the library and + all the required handlers are found, then the corresponding handler + pointers are set, and the return value is true. Otherwise the original + array of descriptors is left untouched and the return value is false. + 'required' is limited by 20 (exceeding of this value will result in failure + to load the symbols and the return value will be false). + 'handle' is the handle of the library if it is loaded. Otherwise it is left + untouched. + 'flags' is the set of DYNAMIC_LINK_* flags. Each of the DYNAMIC_LINK_* flags + allows its corresponding linking stage. 
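+
+    A minimal usage sketch (an illustrative addition, not part of the upstream
+    sources; the library name and entry points below are hypothetical):
+
+      extern "C" void foo_v1();
+      extern "C" void bar_v1();
+      static void (*foo_handler)() = nullptr;
+      static void (*bar_handler)() = nullptr;
+      static const dynamic_link_descriptor FooLinkTable[] = {
+          DLD( foo_v1, foo_handler ),
+          DLD( bar_v1, bar_handler )
+      };
+      // With the default flags this tries the global scope first, then loads the
+      // library from the runtime's own directory, then falls back to weak symbols.
+      bool linked = dynamic_link( "libfoo.so.2", FooLinkTable, 2 );
+      // On success both handler pointers refer to the resolved entry points.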
+**/ +bool dynamic_link( const char* library, + const dynamic_link_descriptor descriptors[], + std::size_t required, + dynamic_link_handle* handle = nullptr, + int flags = DYNAMIC_LINK_DEFAULT ); + +void dynamic_unlink( dynamic_link_handle handle ); + +void dynamic_unlink_all(); + +enum dynamic_link_error_t { + dl_success = 0, + dl_lib_not_found, // char const * lib, dlerr_t err + dl_sym_not_found, // char const * sym, dlerr_t err + // Note: dlerr_t depends on OS: it is char const * on Linux* and macOS*, int on Windows*. + dl_sys_fail, // char const * func, int err + dl_buff_too_small // none +}; // dynamic_link_error_t + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* __TBB_dynamic_link */ diff --git a/third_party/tbb/enumerable_thread_specific.h b/third_party/tbb/enumerable_thread_specific.h new file mode 100644 index 000000000..0bef0393d --- /dev/null +++ b/third_party/tbb/enumerable_thread_specific.h @@ -0,0 +1,1135 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_enumerable_thread_specific_H +#define __TBB_enumerable_thread_specific_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_aligned_space.h" + +#include "third_party/tbb/concurrent_vector.h" +#include "third_party/tbb/tbb_allocator.h" +#include "third_party/tbb/cache_aligned_allocator.h" +#include "third_party/tbb/profiling.h" + +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/thread" +#include "third_party/libcxx/cstring" // memcpy +#include "third_party/libcxx/cstddef" // std::ptrdiff_t + +#include "third_party/tbb/task.h" // for task::suspend_point + +#if _WIN32 || _WIN64 +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" +#else +#include "libc/calls/weirdtypes.h" +#include "libc/sysv/consts/clock.h" +#include "libc/thread/thread.h" +#include "libc/thread/thread2.h" +#endif + +namespace tbb { +namespace detail { +namespace d1 { + +//! 
enum for selecting between single key and key-per-instance versions +enum ets_key_usage_type { + ets_key_per_instance + , ets_no_key +#if __TBB_RESUMABLE_TASKS + , ets_suspend_aware +#endif +}; + +// Forward declaration to use in internal classes +template +class enumerable_thread_specific; + +template +struct internal_ets_key_selector { + using key_type = std::thread::id; + static key_type current_key() { + return std::this_thread::get_id(); + } +}; + +// Intel Compiler on OSX cannot create atomics objects that instantiated from non-fundamental types +#if __INTEL_COMPILER && __APPLE__ +template<> +struct internal_ets_key_selector { + using key_type = std::size_t; + static key_type current_key() { + auto id = std::this_thread::get_id(); + return reinterpret_cast(id); + } +}; +#endif + +template +struct ets_key_selector : internal_ets_key_selector {}; + +#if __TBB_RESUMABLE_TASKS +template <> +struct ets_key_selector { + using key_type = suspend_point; + static key_type current_key() { + return r1::current_suspend_point(); + } +}; +#endif + +template +class ets_base : detail::no_copy { +protected: + using key_type = typename ets_key_selector::key_type; + +public: + struct slot; + struct array { + array* next; + std::size_t lg_size; + slot& at( std::size_t k ) { + return (reinterpret_cast(reinterpret_cast(this+1)))[k]; + } + std::size_t size() const { return std::size_t(1) << lg_size; } + std::size_t mask() const { return size() - 1; } + std::size_t start( std::size_t h ) const { + return h >> (8 * sizeof(std::size_t) - lg_size); + } + }; + struct slot { + std::atomic key; + void* ptr; + bool empty() const { return key.load(std::memory_order_relaxed) == key_type(); } + bool match( key_type k ) const { return key.load(std::memory_order_relaxed) == k; } + bool claim( key_type k ) { + // TODO: maybe claim ptr, because key_type is not guaranteed to fit into word size + key_type expected = key_type(); + return key.compare_exchange_strong(expected, k); + } + }; + +protected: + //! Root of linked list of arrays of decreasing size. + /** nullptr if and only if my_count==0. + Each array in the list is half the size of its predecessor. */ + std::atomic my_root; + std::atomic my_count; + + virtual void* create_local() = 0; + virtual void* create_array(std::size_t _size) = 0; // _size in bytes + virtual void free_array(void* ptr, std::size_t _size) = 0; // _size in bytes + + array* allocate( std::size_t lg_size ) { + std::size_t n = std::size_t(1) << lg_size; + array* a = static_cast(create_array(sizeof(array) + n * sizeof(slot))); + a->lg_size = lg_size; + std::memset( a + 1, 0, n * sizeof(slot) ); + return a; + } + void deallocate(array* a) { + std::size_t n = std::size_t(1) << (a->lg_size); + free_array( static_cast(a), std::size_t(sizeof(array) + n * sizeof(slot)) ); + } + + ets_base() : my_root{nullptr}, my_count{0} {} + virtual ~ets_base(); // g++ complains if this is not virtual + + void* table_lookup( bool& exists ); + void table_clear(); + // The following functions are not used in concurrent context, + // so we don't need synchronization and ITT annotations there. 
+ template + void table_elementwise_copy( const ets_base& other, + void*(*add_element)(ets_base&, void*) ) { + __TBB_ASSERT(!my_root.load(std::memory_order_relaxed), nullptr); + __TBB_ASSERT(!my_count.load(std::memory_order_relaxed), nullptr); + if( !other.my_root.load(std::memory_order_relaxed) ) return; + array* root = allocate(other.my_root.load(std::memory_order_relaxed)->lg_size); + my_root.store(root, std::memory_order_relaxed); + root->next = nullptr; + my_count.store(other.my_count.load(std::memory_order_relaxed), std::memory_order_relaxed); + std::size_t mask = root->mask(); + for( array* r = other.my_root.load(std::memory_order_relaxed); r; r = r->next ) { + for( std::size_t i = 0; i < r->size(); ++i ) { + slot& s1 = r->at(i); + if( !s1.empty() ) { + for( std::size_t j = root->start(std::hash{}(s1.key.load(std::memory_order_relaxed))); ; j = (j+1)&mask ) { + slot& s2 = root->at(j); + if( s2.empty() ) { + s2.ptr = add_element(static_cast&>(*this), s1.ptr); + s2.key.store(s1.key.load(std::memory_order_relaxed), std::memory_order_relaxed); + break; + } + else if( s2.match(s1.key.load(std::memory_order_relaxed)) ) + break; + } + } + } + } + } + void table_swap( ets_base& other ) { + __TBB_ASSERT(this!=&other, "Don't swap an instance with itself"); + swap_atomics_relaxed(my_root, other.my_root); + swap_atomics_relaxed(my_count, other.my_count); + } +}; + +template +ets_base::~ets_base() { + __TBB_ASSERT(!my_root.load(std::memory_order_relaxed), nullptr); +} + +template +void ets_base::table_clear() { + while ( array* r = my_root.load(std::memory_order_relaxed) ) { + my_root.store(r->next, std::memory_order_relaxed); + deallocate(r); + } + my_count.store(0, std::memory_order_relaxed); +} + +template +void* ets_base::table_lookup( bool& exists ) { + const key_type k = ets_key_selector::current_key(); + + __TBB_ASSERT(k != key_type(), nullptr); + void* found; + std::size_t h = std::hash{}(k); + for( array* r = my_root.load(std::memory_order_acquire); r; r = r->next ) { + call_itt_notify(acquired,r); + std::size_t mask=r->mask(); + for(std::size_t i = r->start(h); ;i=(i+1)&mask) { + slot& s = r->at(i); + if( s.empty() ) break; + if( s.match(k) ) { + if( r == my_root.load(std::memory_order_acquire) ) { + // Success at top level + exists = true; + return s.ptr; + } else { + // Success at some other level. Need to insert at top level. + exists = true; + found = s.ptr; + goto insert; + } + } + } + } + // Key does not yet exist. The density of slots in the table does not exceed 0.5, + // for if this will occur a new table is allocated with double the current table + // size, which is swapped in as the new root table. So an empty slot is guaranteed. + exists = false; + found = create_local(); + { + std::size_t c = ++my_count; + array* r = my_root.load(std::memory_order_acquire); + call_itt_notify(acquired,r); + if( !r || c > r->size()/2 ) { + std::size_t s = r ? r->lg_size : 2; + while( c > std::size_t(1)<<(s-1) ) ++s; + array* a = allocate(s); + for(;;) { + a->next = r; + call_itt_notify(releasing,a); + array* new_r = r; + if( my_root.compare_exchange_strong(new_r, a) ) break; + call_itt_notify(acquired, new_r); + __TBB_ASSERT(new_r != nullptr, nullptr); + if( new_r->lg_size >= s ) { + // Another thread inserted an equal or bigger array, so our array is superfluous. + deallocate(a); + break; + } + r = new_r; + } + } + } + insert: + // Whether a slot has been found in an older table, or if it has been inserted at this level, + // it has already been accounted for in the total. 
Guaranteed to be room for it, and it is + // not present, so search for empty slot and use it. + array* ir = my_root.load(std::memory_order_acquire); + call_itt_notify(acquired, ir); + std::size_t mask = ir->mask(); + for(std::size_t i = ir->start(h);; i = (i+1)&mask) { + slot& s = ir->at(i); + if( s.empty() ) { + if( s.claim(k) ) { + s.ptr = found; + return found; + } + } + } +} + +//! Specialization that exploits native TLS +template <> +class ets_base: public ets_base { + using super = ets_base; +#if _WIN32||_WIN64 +#if __TBB_WIN8UI_SUPPORT + using tls_key_t = DWORD; + void create_key() { my_key = FlsAlloc(nullptr); } + void destroy_key() { FlsFree(my_key); } + void set_tls(void * value) { FlsSetValue(my_key, (LPVOID)value); } + void* get_tls() { return (void *)FlsGetValue(my_key); } +#else + using tls_key_t = DWORD; + void create_key() { my_key = TlsAlloc(); } + void destroy_key() { TlsFree(my_key); } + void set_tls(void * value) { TlsSetValue(my_key, (LPVOID)value); } + void* get_tls() { return (void *)TlsGetValue(my_key); } +#endif +#else + using tls_key_t = pthread_key_t; + void create_key() { pthread_key_create(&my_key, nullptr); } + void destroy_key() { pthread_key_delete(my_key); } + void set_tls( void * value ) const { pthread_setspecific(my_key, value); } + void* get_tls() const { return pthread_getspecific(my_key); } +#endif + tls_key_t my_key; + virtual void* create_local() override = 0; + virtual void* create_array(std::size_t _size) override = 0; // _size in bytes + virtual void free_array(void* ptr, std::size_t _size) override = 0; // size in bytes +protected: + ets_base() {create_key();} + ~ets_base() {destroy_key();} + void* table_lookup( bool& exists ) { + void* found = get_tls(); + if( found ) { + exists=true; + } else { + found = super::table_lookup(exists); + set_tls(found); + } + return found; + } + void table_clear() { + destroy_key(); + create_key(); + super::table_clear(); + } + void table_swap( ets_base& other ) { + using std::swap; + __TBB_ASSERT(this!=&other, "Don't swap an instance with itself"); + swap(my_key, other.my_key); + super::table_swap(other); + } +}; + +//! Random access iterator for traversing the thread local copies. +template< typename Container, typename Value > +class enumerable_thread_specific_iterator +{ + //! current position in the concurrent_vector + + Container *my_container; + typename Container::size_type my_index; + mutable Value *my_value; + + template + friend bool operator==( const enumerable_thread_specific_iterator& i, + const enumerable_thread_specific_iterator& j ); + + template + friend bool operator<( const enumerable_thread_specific_iterator& i, + const enumerable_thread_specific_iterator& j ); + + template + friend std::ptrdiff_t operator-( const enumerable_thread_specific_iterator& i, + const enumerable_thread_specific_iterator& j ); + + template + friend class enumerable_thread_specific_iterator; + +public: + //! STL support + using difference_type = std::ptrdiff_t; + using value_type = Value; + using pointer = Value*; + using reference = Value&; + using iterator_category = std::random_access_iterator_tag; + + enumerable_thread_specific_iterator( const Container &container, typename Container::size_type index ) : + my_container(&const_cast(container)), my_index(index), my_value(nullptr) {} + + //! 
Default constructor + enumerable_thread_specific_iterator() : my_container(nullptr), my_index(0), my_value(nullptr) {} + + template + enumerable_thread_specific_iterator( const enumerable_thread_specific_iterator& other ) : + my_container( other.my_container ), my_index( other.my_index), my_value( const_cast(other.my_value) ) {} + + enumerable_thread_specific_iterator operator+( std::ptrdiff_t offset ) const { + return enumerable_thread_specific_iterator(*my_container, my_index + offset); + } + + friend enumerable_thread_specific_iterator operator+( std::ptrdiff_t offset, enumerable_thread_specific_iterator v ) { + return enumerable_thread_specific_iterator(*v.my_container, v.my_index + offset); + } + + enumerable_thread_specific_iterator &operator+=( std::ptrdiff_t offset ) { + my_index += offset; + my_value = nullptr; + return *this; + } + + enumerable_thread_specific_iterator operator-( std::ptrdiff_t offset ) const { + return enumerable_thread_specific_iterator( *my_container, my_index-offset ); + } + + enumerable_thread_specific_iterator &operator-=( std::ptrdiff_t offset ) { + my_index -= offset; + my_value = nullptr; + return *this; + } + + Value& operator*() const { + Value* value = my_value; + if( !value ) { + value = my_value = (*my_container)[my_index].value(); + } + __TBB_ASSERT( value==(*my_container)[my_index].value(), "corrupt cache" ); + return *value; + } + + Value& operator[]( std::ptrdiff_t k ) const { + return *(*my_container)[my_index + k].value(); + } + + Value* operator->() const {return &operator*();} + + enumerable_thread_specific_iterator& operator++() { + ++my_index; + my_value = nullptr; + return *this; + } + + enumerable_thread_specific_iterator& operator--() { + --my_index; + my_value = nullptr; + return *this; + } + + //! Post increment + enumerable_thread_specific_iterator operator++(int) { + enumerable_thread_specific_iterator result = *this; + ++my_index; + my_value = nullptr; + return result; + } + + //! 
Post decrement + enumerable_thread_specific_iterator operator--(int) { + enumerable_thread_specific_iterator result = *this; + --my_index; + my_value = nullptr; + return result; + } +}; + +template +bool operator==( const enumerable_thread_specific_iterator& i, + const enumerable_thread_specific_iterator& j ) { + return i.my_index == j.my_index && i.my_container == j.my_container; +} + +template +bool operator!=( const enumerable_thread_specific_iterator& i, + const enumerable_thread_specific_iterator& j ) { + return !(i==j); +} + +template +bool operator<( const enumerable_thread_specific_iterator& i, + const enumerable_thread_specific_iterator& j ) { + return i.my_index +bool operator>( const enumerable_thread_specific_iterator& i, + const enumerable_thread_specific_iterator& j ) { + return j +bool operator>=( const enumerable_thread_specific_iterator& i, + const enumerable_thread_specific_iterator& j ) { + return !(i +bool operator<=( const enumerable_thread_specific_iterator& i, + const enumerable_thread_specific_iterator& j ) { + return !(j +std::ptrdiff_t operator-( const enumerable_thread_specific_iterator& i, + const enumerable_thread_specific_iterator& j ) { + return i.my_index-j.my_index; +} + +template +class segmented_iterator +{ + template + friend bool operator==(const segmented_iterator& i, const segmented_iterator& j); + + template + friend bool operator!=(const segmented_iterator& i, const segmented_iterator& j); + + template + friend class segmented_iterator; + +public: + segmented_iterator() {my_segcont = nullptr;} + + segmented_iterator( const SegmentedContainer& _segmented_container ) : + my_segcont(const_cast(&_segmented_container)), + outer_iter(my_segcont->end()) { } + + ~segmented_iterator() {} + + using InnerContainer = typename SegmentedContainer::value_type; + using inner_iterator = typename InnerContainer::iterator; + using outer_iterator = typename SegmentedContainer::iterator; + + // STL support + // TODO: inherit all types from segmented container? + using difference_type = std::ptrdiff_t; + using value_type = Value; + using size_type = typename SegmentedContainer::size_type; + using pointer = Value*; + using reference = Value&; + using iterator_category = std::input_iterator_tag; + + // Copy Constructor + template + segmented_iterator(const segmented_iterator& other) : + my_segcont(other.my_segcont), + outer_iter(other.outer_iter), + // can we assign a default-constructed iterator to inner if we're at the end? + inner_iter(other.inner_iter) + {} + + // assignment + template + segmented_iterator& operator=( const segmented_iterator& other) { + my_segcont = other.my_segcont; + outer_iter = other.outer_iter; + if(outer_iter != my_segcont->end()) inner_iter = other.inner_iter; + return *this; + } + + // allow assignment of outer iterator to segmented iterator. Once it is + // assigned, move forward until a non-empty inner container is found or + // the end of the outer container is reached. 
+ segmented_iterator& operator=(const outer_iterator& new_outer_iter) { + __TBB_ASSERT(my_segcont != nullptr, nullptr); + // check that this iterator points to something inside the segmented container + for(outer_iter = new_outer_iter ;outer_iter!=my_segcont->end(); ++outer_iter) { + if( !outer_iter->empty() ) { + inner_iter = outer_iter->begin(); + break; + } + } + return *this; + } + + // pre-increment + segmented_iterator& operator++() { + advance_me(); + return *this; + } + + // post-increment + segmented_iterator operator++(int) { + segmented_iterator tmp = *this; + operator++(); + return tmp; + } + + bool operator==(const outer_iterator& other_outer) const { + __TBB_ASSERT(my_segcont != nullptr, nullptr); + return (outer_iter == other_outer && + (outer_iter == my_segcont->end() || inner_iter == outer_iter->begin())); + } + + bool operator!=(const outer_iterator& other_outer) const { + return !operator==(other_outer); + + } + + // (i)* RHS + reference operator*() const { + __TBB_ASSERT(my_segcont != nullptr, nullptr); + __TBB_ASSERT(outer_iter != my_segcont->end(), "Dereferencing a pointer at end of container"); + __TBB_ASSERT(inner_iter != outer_iter->end(), nullptr); // should never happen + return *inner_iter; + } + + // i-> + pointer operator->() const { return &operator*();} + +private: + SegmentedContainer* my_segcont; + outer_iterator outer_iter; + inner_iterator inner_iter; + + void advance_me() { + __TBB_ASSERT(my_segcont != nullptr, nullptr); + __TBB_ASSERT(outer_iter != my_segcont->end(), nullptr); // not true if there are no inner containers + __TBB_ASSERT(inner_iter != outer_iter->end(), nullptr); // not true if the inner containers are all empty. + ++inner_iter; + while(inner_iter == outer_iter->end() && ++outer_iter != my_segcont->end()) { + inner_iter = outer_iter->begin(); + } + } +}; // segmented_iterator + +template +bool operator==( const segmented_iterator& i, + const segmented_iterator& j ) { + if(i.my_segcont != j.my_segcont) return false; + if(i.my_segcont == nullptr) return true; + if(i.outer_iter != j.outer_iter) return false; + if(i.outer_iter == i.my_segcont->end()) return true; + return i.inner_iter == j.inner_iter; +} + +// != +template +bool operator!=( const segmented_iterator& i, + const segmented_iterator& j ) { + return !(i==j); +} + +template +struct construct_by_default: no_assign { + void construct(void*where) {new(where) T();} // C++ note: the () in T() ensure zero initialization. + construct_by_default( int ) {} +}; + +template +struct construct_by_exemplar: no_assign { + const T exemplar; + void construct(void*where) {new(where) T(exemplar);} + construct_by_exemplar( const T& t ) : exemplar(t) {} + construct_by_exemplar( T&& t ) : exemplar(std::move(t)) {} +}; + +template +struct construct_by_finit: no_assign { + Finit f; + void construct(void* where) {new(where) T(f());} + construct_by_finit( Finit&& f_ ) : f(std::move(f_)) {} +}; + +template +struct construct_by_args: no_assign { + stored_pack pack; + void construct(void* where) { + call( [where](const typename std::decay
<P>
::type&... args ){ + new(where) T(args...); + }, pack ); + } + construct_by_args( P&& ... args ) : pack(std::forward
<P>
(args)...) {} +}; + +// storage for initialization function pointer +// TODO: consider removing the template parameter T here and in callback_leaf +class callback_base { +public: + // Clone *this + virtual callback_base* clone() const = 0; + // Destruct and free *this + virtual void destroy() = 0; + // Need virtual destructor to satisfy GCC compiler warning + virtual ~callback_base() { } + // Construct T at where + virtual void construct(void* where) = 0; +}; + +template +class callback_leaf: public callback_base, Constructor { + template callback_leaf( P&& ... params ) : Constructor(std::forward
<P>
(params)...) {} + // TODO: make the construction/destruction consistent (use allocator.construct/destroy) + using my_allocator_type = typename tbb::tbb_allocator; + + callback_base* clone() const override { + return make(*this); + } + + void destroy() override { + my_allocator_type alloc; + tbb::detail::allocator_traits::destroy(alloc, this); + tbb::detail::allocator_traits::deallocate(alloc, this, 1); + } + + void construct(void* where) override { + Constructor::construct(where); + } + +public: + template + static callback_base* make( P&& ... params ) { + void* where = my_allocator_type().allocate(1); + return new(where) callback_leaf( std::forward
<P>
(params)... ); + } +}; + +//! Template for recording construction of objects in table +/** All maintenance of the space will be done explicitly on push_back, + and all thread local copies must be destroyed before the concurrent + vector is deleted. + + The flag is_built is initialized to false. When the local is + successfully-constructed, set the flag to true or call value_committed(). + If the constructor throws, the flag will be false. +*/ +template +struct ets_element { + detail::aligned_space my_space; + bool is_built; + ets_element() { is_built = false; } // not currently-built + U* value() { return my_space.begin(); } + U* value_committed() { is_built = true; return my_space.begin(); } + ~ets_element() { + if(is_built) { + my_space.begin()->~U(); + is_built = false; + } + } +}; + +// A predicate that can be used for a compile-time compatibility check of ETS instances +// Ideally, it should have been declared inside the ETS class, but unfortunately +// in that case VS2013 does not enable the variadic constructor. +template struct is_compatible_ets : std::false_type {}; +template +struct is_compatible_ets< T, enumerable_thread_specific > : std::is_same {}; + +// A predicate that checks whether, for a variable 'foo' of type T, foo() is a valid expression +template using has_empty_braces_operator = decltype(std::declval()()); +template using is_callable_no_args = supports; + +//! The enumerable_thread_specific container +/** enumerable_thread_specific has the following properties: + - thread-local copies are lazily created, with default, exemplar or function initialization. + - thread-local copies do not move (during lifetime, and excepting clear()) so the address of a copy is invariant. + - the contained objects need not have operator=() defined if combine is not used. + - enumerable_thread_specific containers may be copy-constructed or assigned. + - thread-local copies can be managed by hash-table, or can be accessed via TLS storage for speed. + - outside of parallel contexts, the contents of all thread-local copies are accessible by iterator or using combine or combine_each methods + +@par Segmented iterator + When the thread-local objects are containers with input_iterators defined, a segmented iterator may + be used to iterate over all the elements of all thread-local copies. + +@par combine and combine_each + - Both methods are defined for enumerable_thread_specific. + - combine() requires the type T have operator=() defined. + - neither method modifies the contents of the object (though there is no guarantee that the applied methods do not modify the object.) + - Both are evaluated in serial context (the methods are assumed to be non-benign.) + +@ingroup containers */ +template , + ets_key_usage_type ETS_key_type=ets_no_key > +class enumerable_thread_specific: ets_base { + + template friend class enumerable_thread_specific; + + using padded_element = padded>; + + //! 
A generic range, used to create range objects from the iterators + template + class generic_range_type: public blocked_range { + public: + using value_type = T; + using reference = T&; + using const_reference = const T&; + using iterator = I; + using difference_type = std::ptrdiff_t; + + generic_range_type( I begin_, I end_, std::size_t grainsize_ = 1) : blocked_range(begin_,end_,grainsize_) {} + template + generic_range_type( const generic_range_type& r) : blocked_range(r.begin(),r.end(),r.grainsize()) {} + generic_range_type( generic_range_type& r, split ) : blocked_range(r,split()) {} + }; + + using allocator_traits_type = tbb::detail::allocator_traits; + + using padded_allocator_type = typename allocator_traits_type::template rebind_alloc; + using internal_collection_type = tbb::concurrent_vector< padded_element, padded_allocator_type >; + + callback_base *my_construct_callback; + + internal_collection_type my_locals; + + // TODO: consider unifying the callback mechanism for all create_local* methods below + // (likely non-compatible and requires interface version increase) + void* create_local() override { + padded_element& lref = *my_locals.grow_by(1); + my_construct_callback->construct(lref.value()); + return lref.value_committed(); + } + + static void* create_local_by_copy( ets_base& base, void* p ) { + enumerable_thread_specific& ets = static_cast(base); + padded_element& lref = *ets.my_locals.grow_by(1); + new(lref.value()) T(*static_cast(p)); + return lref.value_committed(); + } + + static void* create_local_by_move( ets_base& base, void* p ) { + enumerable_thread_specific& ets = static_cast(base); + padded_element& lref = *ets.my_locals.grow_by(1); + new(lref.value()) T(std::move(*static_cast(p))); + return lref.value_committed(); + } + + using array_allocator_type = typename allocator_traits_type::template rebind_alloc; + + // _size is in bytes + void* create_array(std::size_t _size) override { + std::size_t nelements = (_size + sizeof(uintptr_t) -1) / sizeof(uintptr_t); + return array_allocator_type().allocate(nelements); + } + + void free_array( void* _ptr, std::size_t _size) override { + std::size_t nelements = (_size + sizeof(uintptr_t) -1) / sizeof(uintptr_t); + array_allocator_type().deallocate( reinterpret_cast(_ptr),nelements); + } + +public: + + //! Basic types + using value_type = T; + using allocator_type = Allocator; + using size_type = typename internal_collection_type::size_type; + using difference_type = typename internal_collection_type::difference_type; + using reference = value_type&; + using const_reference = const value_type&; + + using pointer = typename allocator_traits_type::pointer; + using const_pointer = typename allocator_traits_type::const_pointer; + + // Iterator types + using iterator = enumerable_thread_specific_iterator; + using const_iterator = enumerable_thread_specific_iterator; + + // Parallel range types + using range_type = generic_range_type; + using const_range_type = generic_range_type; + + //! Default constructor. Each local instance of T is default constructed. + enumerable_thread_specific() : my_construct_callback( + callback_leaf >::make(/*dummy argument*/0) + ){} + + //! Constructor with initializer functor. Each local instance of T is constructed by T(finit()). + template ::type>::value>::type> + explicit enumerable_thread_specific( Finit finit ) : my_construct_callback( + callback_leaf >::make( std::move(finit) ) + ){} + + //! Constructor with exemplar. Each local instance of T is copy-constructed from the exemplar. 
+ explicit enumerable_thread_specific( const T& exemplar ) : my_construct_callback( + callback_leaf >::make( exemplar ) + ){} + + explicit enumerable_thread_specific( T&& exemplar ) : my_construct_callback( + callback_leaf >::make( std::move(exemplar) ) + ){} + + //! Variadic constructor with initializer arguments. Each local instance of T is constructed by T(args...) + template ::type>::value + && !is_compatible_ets::type>::value + && !std::is_same::type>::value + >::type> + enumerable_thread_specific( P1&& arg1, P&& ... args ) : my_construct_callback( + callback_leaf >::make( std::forward(arg1), std::forward
<P>
(args)... ) + ){} + + //! Destructor + ~enumerable_thread_specific() { + if(my_construct_callback) my_construct_callback->destroy(); + // Deallocate the hash table before overridden free_array() becomes inaccessible + this->ets_base::table_clear(); + } + + //! returns reference to local, discarding exists + reference local() { + bool exists; + return local(exists); + } + + //! Returns reference to calling thread's local copy, creating one if necessary + reference local(bool& exists) { + void* ptr = this->table_lookup(exists); + return *(T*)ptr; + } + + //! Get the number of local copies + size_type size() const { return my_locals.size(); } + + //! true if there have been no local copies created + bool empty() const { return my_locals.empty(); } + + //! begin iterator + iterator begin() { return iterator( my_locals, 0 ); } + //! end iterator + iterator end() { return iterator(my_locals, my_locals.size() ); } + + //! begin const iterator + const_iterator begin() const { return const_iterator(my_locals, 0); } + + //! end const iterator + const_iterator end() const { return const_iterator(my_locals, my_locals.size()); } + + //! Get range for parallel algorithms + range_type range( std::size_t grainsize=1 ) { return range_type( begin(), end(), grainsize ); } + + //! Get const range for parallel algorithms + const_range_type range( std::size_t grainsize=1 ) const { return const_range_type( begin(), end(), grainsize ); } + + //! Destroys local copies + void clear() { + my_locals.clear(); + this->table_clear(); + // callback is not destroyed + } + +private: + template + void internal_copy(const enumerable_thread_specific& other) { + // this tests is_compatible_ets + static_assert( (is_compatible_ets::type>::value), "is_compatible_ets fails" ); + // Initialize my_construct_callback first, so that it is valid even if rest of this routine throws an exception. + my_construct_callback = other.my_construct_callback->clone(); + __TBB_ASSERT(my_locals.size()==0, nullptr); + my_locals.reserve(other.size()); + this->table_elementwise_copy( other, create_local_by_copy ); + } + + void internal_swap(enumerable_thread_specific& other) { + using std::swap; + __TBB_ASSERT( this!=&other, nullptr); + swap(my_construct_callback, other.my_construct_callback); + // concurrent_vector::swap() preserves storage space, + // so addresses to the vector kept in ETS hash table remain valid. 
+ swap(my_locals, other.my_locals); + this->ets_base::table_swap(other); + } + + template + void internal_move(enumerable_thread_specific&& other) { + static_assert( (is_compatible_ets::type>::value), "is_compatible_ets fails" ); + my_construct_callback = other.my_construct_callback; + other.my_construct_callback = nullptr; + __TBB_ASSERT(my_locals.size()==0, nullptr); + my_locals.reserve(other.size()); + this->table_elementwise_copy( other, create_local_by_move ); + } + +public: + enumerable_thread_specific( const enumerable_thread_specific& other ) + : ets_base() /* prevents GCC warnings with -Wextra */ + { + internal_copy(other); + } + + template + enumerable_thread_specific( const enumerable_thread_specific& other ) + { + internal_copy(other); + } + + enumerable_thread_specific( enumerable_thread_specific&& other ) : my_construct_callback() + { + // TODO: use internal_move correctly here + internal_swap(other); + } + + template + enumerable_thread_specific( enumerable_thread_specific&& other ) : my_construct_callback() + { + internal_move(std::move(other)); + } + + enumerable_thread_specific& operator=( const enumerable_thread_specific& other ) + { + if( this != &other ) { + this->clear(); + my_construct_callback->destroy(); + internal_copy( other ); + } + return *this; + } + + template + enumerable_thread_specific& operator=( const enumerable_thread_specific& other ) + { + __TBB_ASSERT( static_cast(this)!=static_cast(&other), nullptr); // Objects of different types + this->clear(); + my_construct_callback->destroy(); + internal_copy(other); + return *this; + } + + enumerable_thread_specific& operator=( enumerable_thread_specific&& other ) + { + if( this != &other ) { + // TODO: use internal_move correctly here + internal_swap(other); + } + return *this; + } + + template + enumerable_thread_specific& operator=( enumerable_thread_specific&& other ) + { + __TBB_ASSERT( static_cast(this)!=static_cast(&other), nullptr); // Objects of different types + this->clear(); + my_construct_callback->destroy(); + internal_move(std::move(other)); + return *this; + } + + // CombineFunc has signature T(T,T) or T(const T&, const T&) + template + T combine(CombineFunc f_combine) { + if(begin() == end()) { + ets_element location; + my_construct_callback->construct(location.value()); + return *location.value_committed(); + } + const_iterator ci = begin(); + T my_result = *ci; + while(++ci != end()) + my_result = f_combine( my_result, *ci ); + return my_result; + } + + // combine_func_t takes T by value or by [const] reference, and returns nothing + template + void combine_each(CombineFunc f_combine) { + for(iterator ci = begin(); ci != end(); ++ci) { + f_combine( *ci ); + } + } + +}; // enumerable_thread_specific + +template< typename Container > +class flattened2d { + // This intermediate typedef is to address issues with VC7.1 compilers + using conval_type = typename Container::value_type; + +public: + //! 
Basic types + using size_type = typename conval_type::size_type; + using difference_type = typename conval_type::difference_type; + using allocator_type = typename conval_type::allocator_type; + using value_type = typename conval_type::value_type; + using reference = typename conval_type::reference; + using const_reference = typename conval_type::const_reference; + using pointer = typename conval_type::pointer; + using const_pointer = typename conval_type::const_pointer; + + using iterator = segmented_iterator; + using const_iterator = segmented_iterator; + + flattened2d( const Container &c, typename Container::const_iterator b, typename Container::const_iterator e ) : + my_container(const_cast(&c)), my_begin(b), my_end(e) { } + + explicit flattened2d( const Container &c ) : + my_container(const_cast(&c)), my_begin(c.begin()), my_end(c.end()) { } + + iterator begin() { return iterator(*my_container) = my_begin; } + iterator end() { return iterator(*my_container) = my_end; } + const_iterator begin() const { return const_iterator(*my_container) = my_begin; } + const_iterator end() const { return const_iterator(*my_container) = my_end; } + + size_type size() const { + size_type tot_size = 0; + for(typename Container::const_iterator i = my_begin; i != my_end; ++i) { + tot_size += i->size(); + } + return tot_size; + } + +private: + Container *my_container; + typename Container::const_iterator my_begin; + typename Container::const_iterator my_end; +}; + +template +flattened2d flatten2d(const Container &c, const typename Container::const_iterator b, const typename Container::const_iterator e) { + return flattened2d(c, b, e); +} + +template +flattened2d flatten2d(const Container &c) { + return flattened2d(c); +} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::enumerable_thread_specific; +using detail::d1::flattened2d; +using detail::d1::flatten2d; +// ets enum keys +using detail::d1::ets_key_usage_type; +using detail::d1::ets_key_per_instance; +using detail::d1::ets_no_key; +#if __TBB_RESUMABLE_TASKS +using detail::d1::ets_suspend_aware; +#endif +} // inline namespace v1 + +} // namespace tbb + +#endif // __TBB_enumerable_thread_specific_H + diff --git a/third_party/tbb/environment.h b/third_party/tbb/environment.h new file mode 100644 index 000000000..15442a7f6 --- /dev/null +++ b/third_party/tbb/environment.h @@ -0,0 +1,82 @@ +// clang-format off +/* + Copyright (c) 2018-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_tbb_environment_H +#define __TBB_tbb_environment_H + +#include "third_party/libcxx/cstdlib" +#include "third_party/libcxx/cstring" +#include "third_party/libcxx/cerrno" +#include "third_party/libcxx/cctype" + +namespace tbb { +namespace detail { +namespace r1 { + +#if __TBB_WIN8UI_SUPPORT +static inline bool GetBoolEnvironmentVariable( const char * ) { + return false; +} + +static inline long GetIntegralEnvironmentVariable( const char * ) { + return -1; +} +#else /* __TBB_WIN8UI_SUPPORT */ +static inline bool GetBoolEnvironmentVariable( const char * name ) { + if ( const char* s = std::getenv(name) ) { + // The result is defined as true only if the environment variable contains + // no characters except one '1' character and an arbitrary number of spaces + // (including the absence of spaces). + size_t index = std::strspn(s, " "); + if (s[index] != '1') return false; + index++; + // Memory access after incrementing is safe, since the getenv() returns a + // null-terminated string, and even if the character getting by index is '1', + // and this character is the end of string, after incrementing we will get + // an index of character, that contains '\0' + index += std::strspn(&s[index], " "); + return !s[index]; + } + return false; +} + +static inline long GetIntegralEnvironmentVariable( const char * name ) { + if ( const char* s = std::getenv(name) ) { + char* end = nullptr; + errno = 0; + long value = std::strtol(s, &end, 10); + + // We have exceeded the range, value is negative or string is incovertable + if ( errno == ERANGE || value < 0 || end==s ) { + return -1; + } + for ( ; *end != '\0'; end++ ) { + if ( !std::isspace(*end) ) { + return -1; + } + } + return value; + } + return -1; +} +#endif /* __TBB_WIN8UI_SUPPORT */ + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_tbb_environment_H diff --git a/third_party/tbb/exception.cpp b/third_party/tbb/exception.cpp new file mode 100644 index 000000000..7668598f4 --- /dev/null +++ b/third_party/tbb/exception.cpp @@ -0,0 +1,167 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_template_helpers.h" + +#include "third_party/libcxx/cstring" +#include "third_party/libcxx/cstdio" +#include "third_party/libcxx/stdexcept" // std::runtime_error +#include "third_party/libcxx/new" +#include "third_party/libcxx/stdexcept" + +#define __TBB_STD_RETHROW_EXCEPTION_POSSIBLY_BROKEN \ + (__GLIBCXX__ && __TBB_GLIBCXX_VERSION>=40700 && __TBB_GLIBCXX_VERSION<60000 && TBB_USE_EXCEPTIONS) + +#if __TBB_STD_RETHROW_EXCEPTION_POSSIBLY_BROKEN +// GCC ABI declarations necessary for a workaround +// MISSING #include +#endif + +namespace tbb { +namespace detail { +namespace r1 { + +const char* bad_last_alloc::what() const noexcept(true) { return "bad allocation in previous or concurrent attempt"; } +const char* user_abort::what() const noexcept(true) { return "User-initiated abort has terminated this operation"; } +const char* missing_wait::what() const noexcept(true) { return "wait() was not called on the structured_task_group"; } + +#if TBB_USE_EXCEPTIONS + template + /*[[noreturn]]*/ void do_throw_noexcept(F throw_func) noexcept { + throw_func(); + } + + /*[[noreturn]]*/ void do_throw_noexcept(void (*throw_func)()) noexcept { + throw_func(); +#if __GNUC__ == 7 + // In release, GCC 7 loses noexcept attribute during tail call optimization. + // The following statement prevents tail call optimization. + volatile bool reach_this_point = true; + suppress_unused_warning(reach_this_point); +#endif + } + + bool terminate_on_exception(); // defined in global_control.cpp and ipc_server.cpp + + template + /*[[noreturn]]*/ void do_throw(F throw_func) { + if (terminate_on_exception()) { + do_throw_noexcept(throw_func); + } + throw_func(); + } + + #define DO_THROW(exc, init_args) do_throw( []{ throw exc init_args; } ); +#else /* !TBB_USE_EXCEPTIONS */ + #define PRINT_ERROR_AND_ABORT(exc_name, msg) \ + std::fprintf (stderr, "Exception %s with message %s would have been thrown, " \ + "if exception handling had not been disabled. Aborting.\n", exc_name, msg); \ + std::fflush(stderr); \ + std::abort(); + #define DO_THROW(exc, init_args) PRINT_ERROR_AND_ABORT(#exc, #init_args) +#endif /* !TBB_USE_EXCEPTIONS */ + +void throw_exception ( exception_id eid ) { + switch ( eid ) { + case exception_id::bad_alloc: DO_THROW(std::bad_alloc, ()); break; + case exception_id::bad_last_alloc: DO_THROW(bad_last_alloc, ()); break; + case exception_id::user_abort: DO_THROW( user_abort, () ); break; + case exception_id::nonpositive_step: DO_THROW(std::invalid_argument, ("Step must be positive") ); break; + case exception_id::out_of_range: DO_THROW(std::out_of_range, ("Index out of requested size range")); break; + case exception_id::reservation_length_error: DO_THROW(std::length_error, ("Attempt to exceed implementation defined length limits")); break; + case exception_id::missing_wait: DO_THROW(missing_wait, ()); break; + case exception_id::invalid_load_factor: DO_THROW(std::out_of_range, ("Invalid hash load factor")); break; + case exception_id::invalid_key: DO_THROW(std::out_of_range, ("invalid key")); break; + case exception_id::bad_tagged_msg_cast: DO_THROW(std::runtime_error, ("Illegal tagged_msg cast")); break; + case exception_id::unsafe_wait: DO_THROW(unsafe_wait, ("Unsafe to wait further")); break; + default: __TBB_ASSERT ( false, "Unknown exception ID" ); + } + __TBB_ASSERT(false, "Unreachable code"); +} + +/* The "what" should be fairly short, not more than about 128 characters. 
+ Because we control all the call sites to handle_perror, it is pointless + to bullet-proof it for very long strings. + + Design note: ADR put this routine off to the side in tbb_misc.cpp instead of + Task.cpp because the throw generates a pathetic lot of code, and ADR wanted + this large chunk of code to be placed on a cold page. */ +void handle_perror( int error_code, const char* what ) { + const int BUF_SIZE = 255; + char buf[BUF_SIZE + 1] = { 0 }; + std::strncat(buf, what, BUF_SIZE); + std::size_t buf_len = std::strlen(buf); + if (error_code) { + std::strncat(buf, ": ", BUF_SIZE - buf_len); + buf_len = std::strlen(buf); + std::strncat(buf, std::strerror(error_code), BUF_SIZE - buf_len); + buf_len = std::strlen(buf); + } + __TBB_ASSERT(buf_len <= BUF_SIZE && buf[buf_len] == 0, nullptr); +#if TBB_USE_EXCEPTIONS + do_throw([&buf] { throw std::runtime_error(buf); }); +#else + PRINT_ERROR_AND_ABORT( "runtime_error", buf); +#endif /* !TBB_USE_EXCEPTIONS */ +} + +#if __TBB_STD_RETHROW_EXCEPTION_POSSIBLY_BROKEN +// Runtime detection and workaround for the GCC bug 62258. +// The problem is that std::rethrow_exception() does not increment a counter +// of active exceptions, causing std::uncaught_exception() to return a wrong value. +// The code is created after, and roughly reflects, the workaround +// at https://gcc.gnu.org/bugzilla/attachment.cgi?id=34683 + +void fix_broken_rethrow() { + struct gcc_eh_data { + void * caughtExceptions; + unsigned int uncaughtExceptions; + }; + gcc_eh_data* eh_data = punned_cast( abi::__cxa_get_globals() ); + ++eh_data->uncaughtExceptions; +} + +bool gcc_rethrow_exception_broken() { + bool is_broken; + __TBB_ASSERT( !std::uncaught_exception(), + "gcc_rethrow_exception_broken() must not be called when an exception is active" ); + try { + // Throw, catch, and rethrow an exception + try { + throw __TBB_GLIBCXX_VERSION; + } catch(...) { + std::rethrow_exception( std::current_exception() ); + } + } catch(...) { + // Check the bug presence + is_broken = std::uncaught_exception(); + } + if( is_broken ) fix_broken_rethrow(); + __TBB_ASSERT( !std::uncaught_exception(), nullptr); + return is_broken; +} +#else +void fix_broken_rethrow() {} +bool gcc_rethrow_exception_broken() { return false; } +#endif /* __TBB_STD_RETHROW_EXCEPTION_POSSIBLY_BROKEN */ + +} // namespace r1 +} // namespace detail +} // namespace tbb + diff --git a/third_party/tbb/flow_graph.h b/third_party/tbb/flow_graph.h new file mode 100644 index 000000000..7b0343ddc --- /dev/null +++ b/third_party/tbb/flow_graph.h @@ -0,0 +1,3377 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_flow_graph_H +#define __TBB_flow_graph_H + +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/memory" +#include "third_party/libcxx/type_traits" + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/spin_mutex.h" +#include "third_party/tbb/null_mutex.h" +#include "third_party/tbb/spin_rw_mutex.h" +#include "third_party/tbb/null_rw_mutex.h" +#include "third_party/tbb/detail/_pipeline_filters.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/detail/_small_object_pool.h" +#include "third_party/tbb/cache_aligned_allocator.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_aggregator.h" +#include "third_party/tbb/detail/_allocator_traits.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/profiling.h" +#include "third_party/tbb/task_arena.h" + +#if TBB_USE_PROFILING_TOOLS && ( __unix__ || __APPLE__ ) + #if __INTEL_COMPILER + // Disabled warning "routine is both inline and noinline" + #pragma warning (push) + #pragma warning( disable: 2196 ) + #endif + #define __TBB_NOINLINE_SYM __attribute__((noinline)) +#else + #define __TBB_NOINLINE_SYM +#endif + +#include "third_party/libcxx/tuple" +#include "third_party/libcxx/list" +#include "third_party/libcxx/queue" +#if __TBB_CPP20_CONCEPTS_PRESENT +// MISSING #include +#endif + +/** @file + \brief The graph related classes and functions + + There are some applications that best express dependencies as messages + passed between nodes in a graph. These messages may contain data or + simply act as signals that a predecessors has completed. The graph + class and its associated node classes can be used to express such + applications. +*/ + +namespace tbb { +namespace detail { + +namespace d1 { + +//! An enumeration the provides the two most common concurrency levels: unlimited and serial +enum concurrency { unlimited = 0, serial = 1 }; + +//! A generic null type +struct null_type {}; + +//! An empty class used for messages that mean "I'm done" +class continue_msg {}; + +} // namespace d1 + +#if __TBB_CPP20_CONCEPTS_PRESENT +namespace d0 { + +template +concept node_body_return_type = std::same_as || + std::convertible_to; + +// TODO: consider using std::invocable here +template +concept continue_node_body = std::copy_constructible && + requires( Body& body, const tbb::detail::d1::continue_msg& v ) { + { body(v) } -> node_body_return_type; + }; + +template +concept function_node_body = std::copy_constructible && + std::invocable && + node_body_return_type, Output>; + +template +concept join_node_function_object = std::copy_constructible && + std::invocable && + std::convertible_to, Key>; + +template +concept input_node_body = std::copy_constructible && + requires( Body& body, tbb::detail::d1::flow_control& fc ) { + { body(fc) } -> adaptive_same_as; + }; + +template +concept multifunction_node_body = std::copy_constructible && + std::invocable; + +template +concept sequencer = std::copy_constructible && + std::invocable && + std::convertible_to, std::size_t>; + +template +concept async_node_body = std::copy_constructible && + std::invocable; + +} // namespace d0 +#endif // __TBB_CPP20_CONCEPTS_PRESENT + +namespace d1 { + +//! 
Forward declaration section +template< typename T > class sender; +template< typename T > class receiver; +class continue_receiver; + +template< typename T, typename U > class limiter_node; // needed for resetting decrementer + +template class successor_cache; +template class broadcast_cache; +template class round_robin_cache; +template class predecessor_cache; +template class reservable_predecessor_cache; + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET +namespace order { +struct following; +struct preceding; +} +template struct node_set; +#endif + + +} // namespace d1 +} // namespace detail +} // namespace tbb + +//! The graph class +#include "third_party/tbb/detail/_flow_graph_impl.h" + +namespace tbb { +namespace detail { +namespace d1 { + +static inline std::pair order_tasks(graph_task* first, graph_task* second) { + if (second->priority > first->priority) + return std::make_pair(second, first); + return std::make_pair(first, second); +} + +// submit task if necessary. Returns the non-enqueued task if there is one. +static inline graph_task* combine_tasks(graph& g, graph_task* left, graph_task* right) { + // if no RHS task, don't change left. + if (right == nullptr) return left; + // right != nullptr + if (left == nullptr) return right; + if (left == SUCCESSFULLY_ENQUEUED) return right; + // left contains a task + if (right != SUCCESSFULLY_ENQUEUED) { + // both are valid tasks + auto tasks_pair = order_tasks(left, right); + spawn_in_graph_arena(g, *tasks_pair.first); + return tasks_pair.second; + } + return left; +} + +//! Pure virtual template class that defines a sender of messages of type T +template< typename T > +class sender { +public: + virtual ~sender() {} + + //! Request an item from the sender + virtual bool try_get( T & ) { return false; } + + //! Reserves an item in the sender + virtual bool try_reserve( T & ) { return false; } + + //! Releases the reserved item + virtual bool try_release( ) { return false; } + + //! Consumes the reserved item + virtual bool try_consume( ) { return false; } + +protected: + //! The output type of this sender + typedef T output_type; + + //! The successor type for this node + typedef receiver successor_type; + + //! Add a new successor to this node + virtual bool register_successor( successor_type &r ) = 0; + + //! Removes a successor from this node + virtual bool remove_successor( successor_type &r ) = 0; + + template + friend bool register_successor(sender& s, receiver& r); + + template + friend bool remove_successor (sender& s, receiver& r); +}; // class sender + +template +bool register_successor(sender& s, receiver& r) { + return s.register_successor(r); +} + +template +bool remove_successor(sender& s, receiver& r) { + return s.remove_successor(r); +} + +//! Pure virtual template class that defines a receiver of messages of type T +template< typename T > +class receiver { +public: + //! Destructor + virtual ~receiver() {} + + //! Put an item to the receiver + bool try_put( const T& t ) { + graph_task *res = try_put_task(t); + if (!res) return false; + if (res != SUCCESSFULLY_ENQUEUED) spawn_in_graph_arena(graph_reference(), *res); + return true; + } + + //! put item to successor; return task to run the successor if possible. +protected: + //! The input type of this receiver + typedef T input_type; + + //! 
The predecessor type for this node + typedef sender predecessor_type; + + template< typename R, typename B > friend class run_and_put_task; + template< typename X, typename Y > friend class broadcast_cache; + template< typename X, typename Y > friend class round_robin_cache; + virtual graph_task *try_put_task(const T& t) = 0; + virtual graph& graph_reference() const = 0; + + template friend class successor_cache; + virtual bool is_continue_receiver() { return false; } + + // TODO revamp: reconsider the inheritance and move node priority out of receiver + virtual node_priority_t priority() const { return no_priority; } + + //! Add a predecessor to the node + virtual bool register_predecessor( predecessor_type & ) { return false; } + + //! Remove a predecessor from the node + virtual bool remove_predecessor( predecessor_type & ) { return false; } + + template + friend bool register_predecessor(receiver& r, sender& s); + template + friend bool remove_predecessor (receiver& r, sender& s); +}; // class receiver + +template +bool register_predecessor(receiver& r, sender& s) { + return r.register_predecessor(s); +} + +template +bool remove_predecessor(receiver& r, sender& s) { + return r.remove_predecessor(s); +} + +//! Base class for receivers of completion messages +/** These receivers automatically reset, but cannot be explicitly waited on */ +class continue_receiver : public receiver< continue_msg > { +protected: + + //! Constructor + explicit continue_receiver( int number_of_predecessors, node_priority_t a_priority ) { + my_predecessor_count = my_initial_predecessor_count = number_of_predecessors; + my_current_count = 0; + my_priority = a_priority; + } + + //! Copy constructor + continue_receiver( const continue_receiver& src ) : receiver() { + my_predecessor_count = my_initial_predecessor_count = src.my_initial_predecessor_count; + my_current_count = 0; + my_priority = src.my_priority; + } + + //! Increments the trigger threshold + bool register_predecessor( predecessor_type & ) override { + spin_mutex::scoped_lock l(my_mutex); + ++my_predecessor_count; + return true; + } + + //! Decrements the trigger threshold + /** Does not check to see if the removal of the predecessor now makes the current count + exceed the new threshold. So removing a predecessor while the graph is active can cause + unexpected results. */ + bool remove_predecessor( predecessor_type & ) override { + spin_mutex::scoped_lock l(my_mutex); + --my_predecessor_count; + return true; + } + + //! The input type + typedef continue_msg input_type; + + //! The predecessor type for this node + typedef receiver::predecessor_type predecessor_type; + + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + // execute body is supposed to be too small to create a task for. + graph_task* try_put_task( const input_type & ) override { + { + spin_mutex::scoped_lock l(my_mutex); + if ( ++my_current_count < my_predecessor_count ) + return SUCCESSFULLY_ENQUEUED; + else + my_current_count = 0; + } + graph_task* res = execute(); + return res? 
res : SUCCESSFULLY_ENQUEUED; + } + + spin_mutex my_mutex; + int my_predecessor_count; + int my_current_count; + int my_initial_predecessor_count; + node_priority_t my_priority; + // the friend declaration in the base class did not eliminate the "protected class" + // error in gcc 4.1.2 + template friend class limiter_node; + + virtual void reset_receiver( reset_flags f ) { + my_current_count = 0; + if (f & rf_clear_edges) { + my_predecessor_count = my_initial_predecessor_count; + } + } + + //! Does whatever should happen when the threshold is reached + /** This should be very fast or else spawn a task. This is + called while the sender is blocked in the try_put(). */ + virtual graph_task* execute() = 0; + template friend class successor_cache; + bool is_continue_receiver() override { return true; } + + node_priority_t priority() const override { return my_priority; } +}; // class continue_receiver + +#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + template + K key_from_message( const T &t ) { + return t.key(); + } +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + +} // d1 +} // detail +} // tbb + +#include "third_party/tbb/detail/_flow_graph_trace_impl.h" +#include "third_party/tbb/detail/_hash_compare.h" + +namespace tbb { +namespace detail { +namespace d1 { + +#include "third_party/tbb/detail/_flow_graph_body_impl.h" +#include "third_party/tbb/detail/_flow_graph_cache_impl.h" +#include "third_party/tbb/detail/_flow_graph_types_impl.h" + +using namespace graph_policy_namespace; + +template +graph_iterator::graph_iterator(C *g, bool begin) : my_graph(g), current_node(nullptr) +{ + if (begin) current_node = my_graph->my_nodes; + //else it is an end iterator by default +} + +template +typename graph_iterator::reference graph_iterator::operator*() const { + __TBB_ASSERT(current_node, "graph_iterator at end"); + return *operator->(); +} + +template +typename graph_iterator::pointer graph_iterator::operator->() const { + return current_node; +} + +template +void graph_iterator::internal_forward() { + if (current_node) current_node = current_node->next; +} + +//! 
Constructs a graph with isolated task_group_context +inline graph::graph() : my_wait_context(0), my_nodes(nullptr), my_nodes_last(nullptr), my_task_arena(nullptr) { + prepare_task_arena(); + own_context = true; + cancelled = false; + caught_exception = false; + my_context = new (r1::cache_aligned_allocate(sizeof(task_group_context))) task_group_context(FLOW_TASKS); + fgt_graph(this); + my_is_active = true; +} + +inline graph::graph(task_group_context& use_this_context) : + my_wait_context(0), my_context(&use_this_context), my_nodes(nullptr), my_nodes_last(nullptr), my_task_arena(nullptr) { + prepare_task_arena(); + own_context = false; + cancelled = false; + caught_exception = false; + fgt_graph(this); + my_is_active = true; +} + +inline graph::~graph() { + wait_for_all(); + if (own_context) { + my_context->~task_group_context(); + r1::cache_aligned_deallocate(my_context); + } + delete my_task_arena; +} + +inline void graph::reserve_wait() { + my_wait_context.reserve(); + fgt_reserve_wait(this); +} + +inline void graph::release_wait() { + fgt_release_wait(this); + my_wait_context.release(); +} + +inline void graph::register_node(graph_node *n) { + n->next = nullptr; + { + spin_mutex::scoped_lock lock(nodelist_mutex); + n->prev = my_nodes_last; + if (my_nodes_last) my_nodes_last->next = n; + my_nodes_last = n; + if (!my_nodes) my_nodes = n; + } +} + +inline void graph::remove_node(graph_node *n) { + { + spin_mutex::scoped_lock lock(nodelist_mutex); + __TBB_ASSERT(my_nodes && my_nodes_last, "graph::remove_node: Error: no registered nodes"); + if (n->prev) n->prev->next = n->next; + if (n->next) n->next->prev = n->prev; + if (my_nodes_last == n) my_nodes_last = n->prev; + if (my_nodes == n) my_nodes = n->next; + } + n->prev = n->next = nullptr; +} + +inline void graph::reset( reset_flags f ) { + // reset context + deactivate_graph(*this); + + my_context->reset(); + cancelled = false; + caught_exception = false; + // reset all the nodes comprising the graph + for(iterator ii = begin(); ii != end(); ++ii) { + graph_node *my_p = &(*ii); + my_p->reset_node(f); + } + // Reattach the arena. Might be useful to run the graph in a particular task_arena + // while not limiting graph lifetime to a single task_arena::execute() call. + prepare_task_arena( /*reinit=*/true ); + activate_graph(*this); +} + +inline void graph::cancel() { + my_context->cancel_group_execution(); +} + +inline graph::iterator graph::begin() { return iterator(this, true); } + +inline graph::iterator graph::end() { return iterator(this, false); } + +inline graph::const_iterator graph::begin() const { return const_iterator(this, true); } + +inline graph::const_iterator graph::end() const { return const_iterator(this, false); } + +inline graph::const_iterator graph::cbegin() const { return const_iterator(this, true); } + +inline graph::const_iterator graph::cend() const { return const_iterator(this, false); } + +inline graph_node::graph_node(graph& g) : my_graph(g) { + my_graph.register_node(this); +} + +inline graph_node::~graph_node() { + my_graph.remove_node(this); +} + +#include "third_party/tbb/detail/_flow_graph_node_impl.h" + + +//! An executable node that acts as a source, i.e. it has no predecessors + +template < typename Output > + __TBB_requires(std::copyable) +class input_node : public graph_node, public sender< Output > { +public: + //! The type of the output message, which is complete + typedef Output output_type; + + //! 
The type of successors of this node + typedef typename sender::successor_type successor_type; + + // Input node has no input type + typedef null_type input_type; + + //! Constructor for a node with a successor + template< typename Body > + __TBB_requires(input_node_body) + __TBB_NOINLINE_SYM input_node( graph &g, Body body ) + : graph_node(g), my_active(false) + , my_body( new input_body_leaf< output_type, Body>(body) ) + , my_init_body( new input_body_leaf< output_type, Body>(body) ) + , my_successors(this), my_reserved(false), my_has_cached_item(false) + { + fgt_node_with_body(CODEPTR(), FLOW_INPUT_NODE, &this->my_graph, + static_cast *>(this), this->my_body); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + __TBB_requires(input_node_body) + input_node( const node_set& successors, Body body ) + : input_node(successors.graph_reference(), body) + { + make_edges(*this, successors); + } +#endif + + //! Copy constructor + __TBB_NOINLINE_SYM input_node( const input_node& src ) + : graph_node(src.my_graph), sender() + , my_active(false) + , my_body(src.my_init_body->clone()), my_init_body(src.my_init_body->clone()) + , my_successors(this), my_reserved(false), my_has_cached_item(false) + { + fgt_node_with_body(CODEPTR(), FLOW_INPUT_NODE, &this->my_graph, + static_cast *>(this), this->my_body); + } + + //! The destructor + ~input_node() { delete my_body; delete my_init_body; } + + //! Add a new successor to this node + bool register_successor( successor_type &r ) override { + spin_mutex::scoped_lock lock(my_mutex); + my_successors.register_successor(r); + if ( my_active ) + spawn_put(); + return true; + } + + //! Removes a successor from this node + bool remove_successor( successor_type &r ) override { + spin_mutex::scoped_lock lock(my_mutex); + my_successors.remove_successor(r); + return true; + } + + //! Request an item from the node + bool try_get( output_type &v ) override { + spin_mutex::scoped_lock lock(my_mutex); + if ( my_reserved ) + return false; + + if ( my_has_cached_item ) { + v = my_cached_item; + my_has_cached_item = false; + return true; + } + // we've been asked to provide an item, but we have none. enqueue a task to + // provide one. + if ( my_active ) + spawn_put(); + return false; + } + + //! Reserves an item. + bool try_reserve( output_type &v ) override { + spin_mutex::scoped_lock lock(my_mutex); + if ( my_reserved ) { + return false; + } + + if ( my_has_cached_item ) { + v = my_cached_item; + my_reserved = true; + return true; + } else { + return false; + } + } + + //! Release a reserved item. + /** true = item has been released and so remains in sender, dest must request or reserve future items */ + bool try_release( ) override { + spin_mutex::scoped_lock lock(my_mutex); + __TBB_ASSERT( my_reserved && my_has_cached_item, "releasing non-existent reservation" ); + my_reserved = false; + if(!my_successors.empty()) + spawn_put(); + return true; + } + + //! Consumes a reserved item + bool try_consume( ) override { + spin_mutex::scoped_lock lock(my_mutex); + __TBB_ASSERT( my_reserved && my_has_cached_item, "consuming non-existent reservation" ); + my_reserved = false; + my_has_cached_item = false; + if ( !my_successors.empty() ) { + spawn_put(); + } + return true; + } + + //! 
Activates a node that was created in the inactive state + void activate() { + spin_mutex::scoped_lock lock(my_mutex); + my_active = true; + if (!my_successors.empty()) + spawn_put(); + } + + template + Body copy_function_object() { + input_body &body_ref = *this->my_body; + return dynamic_cast< input_body_leaf & >(body_ref).get_body(); + } + +protected: + + //! resets the input_node to its initial state + void reset_node( reset_flags f) override { + my_active = false; + my_reserved = false; + my_has_cached_item = false; + + if(f & rf_clear_edges) my_successors.clear(); + if(f & rf_reset_bodies) { + input_body *tmp = my_init_body->clone(); + delete my_body; + my_body = tmp; + } + } + +private: + spin_mutex my_mutex; + bool my_active; + input_body *my_body; + input_body *my_init_body; + broadcast_cache< output_type > my_successors; + bool my_reserved; + bool my_has_cached_item; + output_type my_cached_item; + + // used by apply_body_bypass, can invoke body of node. + bool try_reserve_apply_body(output_type &v) { + spin_mutex::scoped_lock lock(my_mutex); + if ( my_reserved ) { + return false; + } + if ( !my_has_cached_item ) { + flow_control control; + + fgt_begin_body( my_body ); + + my_cached_item = (*my_body)(control); + my_has_cached_item = !control.is_pipeline_stopped; + + fgt_end_body( my_body ); + } + if ( my_has_cached_item ) { + v = my_cached_item; + my_reserved = true; + return true; + } else { + return false; + } + } + + graph_task* create_put_task() { + small_object_allocator allocator{}; + typedef input_node_task_bypass< input_node > task_type; + graph_task* t = allocator.new_object(my_graph, allocator, *this); + my_graph.reserve_wait(); + return t; + } + + //! Spawns a task that applies the body + void spawn_put( ) { + if(is_graph_active(this->my_graph)) { + spawn_in_graph_arena(this->my_graph, *create_put_task()); + } + } + + friend class input_node_task_bypass< input_node >; + //! Applies the body. Returning SUCCESSFULLY_ENQUEUED okay; forward_task_bypass will handle it. + graph_task* apply_body_bypass( ) { + output_type v; + if ( !try_reserve_apply_body(v) ) + return nullptr; + + graph_task *last_task = my_successors.try_put_task(v); + if ( last_task ) + try_consume(); + else + try_release(); + return last_task; + } +}; // class input_node + +//! Implements a function node that supports Input -> Output +template + __TBB_requires(std::default_initializable && + std::copy_constructible && + std::copy_constructible) +class function_node + : public graph_node + , public function_input< Input, Output, Policy, cache_aligned_allocator > + , public function_output +{ + typedef cache_aligned_allocator internals_allocator; + +public: + typedef Input input_type; + typedef Output output_type; + typedef function_input input_impl_type; + typedef function_input_queue input_queue_type; + typedef function_output fOutput_type; + typedef typename input_impl_type::predecessor_type predecessor_type; + typedef typename fOutput_type::successor_type successor_type; + + using input_impl_type::my_predecessors; + + //! Constructor + // input_queue_type is allocated here, but destroyed in the function_input_base. + // TODO: pass the graph_buffer_policy to the function_input_base so it can all + // be done in one place. This would be an interface-breaking change. 
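+    // The constructor below attaches the node to graph g, limits the number of
+    // concurrent body invocations to `concurrency` (the `unlimited` and `serial`
+    // constants from the concurrency enum are the common choices), stores a copy
+    // of `body`, and optionally takes a Policy tag plus a node_priority_t used
+    // when spawning this node's tasks.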
+ template< typename Body > + __TBB_requires(function_node_body) + __TBB_NOINLINE_SYM function_node( graph &g, size_t concurrency, + Body body, Policy = Policy(), node_priority_t a_priority = no_priority ) + : graph_node(g), input_impl_type(g, concurrency, body, a_priority), + fOutput_type(g) { + fgt_node_with_body( CODEPTR(), FLOW_FUNCTION_NODE, &this->my_graph, + static_cast *>(this), static_cast *>(this), this->my_body ); + } + + template + __TBB_requires(function_node_body) + function_node( graph& g, size_t concurrency, Body body, node_priority_t a_priority ) + : function_node(g, concurrency, body, Policy(), a_priority) {} + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + __TBB_requires(function_node_body) + function_node( const node_set& nodes, size_t concurrency, Body body, + Policy p = Policy(), node_priority_t a_priority = no_priority ) + : function_node(nodes.graph_reference(), concurrency, body, p, a_priority) { + make_edges_in_order(nodes, *this); + } + + template + __TBB_requires(function_node_body) + function_node( const node_set& nodes, size_t concurrency, Body body, node_priority_t a_priority ) + : function_node(nodes, concurrency, body, Policy(), a_priority) {} +#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + + //! Copy constructor + __TBB_NOINLINE_SYM function_node( const function_node& src ) : + graph_node(src.my_graph), + input_impl_type(src), + fOutput_type(src.my_graph) { + fgt_node_with_body( CODEPTR(), FLOW_FUNCTION_NODE, &this->my_graph, + static_cast *>(this), static_cast *>(this), this->my_body ); + } + +protected: + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + using input_impl_type::try_put_task; + + broadcast_cache &successors () override { return fOutput_type::my_successors; } + + void reset_node(reset_flags f) override { + input_impl_type::reset_function_input(f); + // TODO: use clear() instead. + if(f & rf_clear_edges) { + successors().clear(); + my_predecessors.clear(); + } + __TBB_ASSERT(!(f & rf_clear_edges) || successors().empty(), "function_node successors not empty"); + __TBB_ASSERT(this->my_predecessors.empty(), "function_node predecessors not empty"); + } + +}; // class function_node + +//! implements a function node that supports Input -> (set of outputs) +// Output is a tuple of output types. 
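+// A minimal usage sketch for multifunction_node (illustrative only; it is user
+// code assuming `using namespace tbb::flow;`, the variable names are examples,
+// and make_edge/output_port<N> are the helpers provided elsewhere by the flow
+// graph API). The body receives the input plus a tuple of output ports and may
+// put to any subset of them:
+//
+//     using mf_node = multifunction_node<int, std::tuple<int, int>>;
+//     graph g;
+//     mf_node splitter(g, unlimited,
+//         [](const int& v, mf_node::output_ports_type& ports) {
+//             if (v % 2 == 0) std::get<0>(ports).try_put(v);   // even values
+//             else            std::get<1>(ports).try_put(v);   // odd values
+//         });
+//     queue_node<int> evens(g), odds(g);
+//     make_edge(output_port<0>(splitter), evens);
+//     make_edge(output_port<1>(splitter), odds);
+//     splitter.try_put(3);
+//     g.wait_for_all();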
+template + __TBB_requires(std::default_initializable && + std::copy_constructible) +class multifunction_node : + public graph_node, + public multifunction_input + < + Input, + typename wrap_tuple_elements< + std::tuple_size::value, // #elements in tuple + multifunction_output, // wrap this around each element + Output // the tuple providing the types + >::type, + Policy, + cache_aligned_allocator + > +{ + typedef cache_aligned_allocator internals_allocator; + +protected: + static const int N = std::tuple_size::value; +public: + typedef Input input_type; + typedef null_type output_type; + typedef typename wrap_tuple_elements::type output_ports_type; + typedef multifunction_input< + input_type, output_ports_type, Policy, internals_allocator> input_impl_type; + typedef function_input_queue input_queue_type; +private: + using input_impl_type::my_predecessors; +public: + template + __TBB_requires(multifunction_node_body) + __TBB_NOINLINE_SYM multifunction_node( + graph &g, size_t concurrency, + Body body, Policy = Policy(), node_priority_t a_priority = no_priority + ) : graph_node(g), input_impl_type(g, concurrency, body, a_priority) { + fgt_multioutput_node_with_body( + CODEPTR(), FLOW_MULTIFUNCTION_NODE, + &this->my_graph, static_cast *>(this), + this->output_ports(), this->my_body + ); + } + + template + __TBB_requires(multifunction_node_body) + __TBB_NOINLINE_SYM multifunction_node(graph& g, size_t concurrency, Body body, node_priority_t a_priority) + : multifunction_node(g, concurrency, body, Policy(), a_priority) {} + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + __TBB_requires(multifunction_node_body) + __TBB_NOINLINE_SYM multifunction_node(const node_set& nodes, size_t concurrency, Body body, + Policy p = Policy(), node_priority_t a_priority = no_priority) + : multifunction_node(nodes.graph_reference(), concurrency, body, p, a_priority) { + make_edges_in_order(nodes, *this); + } + + template + __TBB_requires(multifunction_node_body) + __TBB_NOINLINE_SYM multifunction_node(const node_set& nodes, size_t concurrency, Body body, node_priority_t a_priority) + : multifunction_node(nodes, concurrency, body, Policy(), a_priority) {} +#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + + __TBB_NOINLINE_SYM multifunction_node( const multifunction_node &other) : + graph_node(other.my_graph), input_impl_type(other) { + fgt_multioutput_node_with_body( CODEPTR(), FLOW_MULTIFUNCTION_NODE, + &this->my_graph, static_cast *>(this), + this->output_ports(), this->my_body ); + } + + // all the guts are in multifunction_input... +protected: + void reset_node(reset_flags f) override { input_impl_type::reset(f); } +}; // multifunction_node + +//! split_node: accepts a tuple as input, forwards each element of the tuple to its +// successors. The node has unlimited concurrency, so it does not reject inputs. 
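+// A minimal usage sketch for split_node (illustrative only; user code assuming
+// `using namespace tbb::flow;`, with example variable names and the usual
+// make_edge/output_port<N> helpers from the flow graph API):
+//
+//     graph g;
+//     split_node<std::tuple<int, double>> s(g);
+//     queue_node<int> ints(g);
+//     queue_node<double> doubles(g);
+//     make_edge(output_port<0>(s), ints);
+//     make_edge(output_port<1>(s), doubles);
+//     s.try_put(std::make_tuple(1, 2.5));   // 1 goes to ints, 2.5 to doubles
+//     g.wait_for_all();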
+template +class split_node : public graph_node, public receiver { + static const int N = std::tuple_size::value; + typedef receiver base_type; +public: + typedef TupleType input_type; + typedef typename wrap_tuple_elements< + N, // #elements in tuple + multifunction_output, // wrap this around each element + TupleType // the tuple providing the types + >::type output_ports_type; + + __TBB_NOINLINE_SYM explicit split_node(graph &g) + : graph_node(g), + my_output_ports(init_output_ports::call(g, my_output_ports)) + { + fgt_multioutput_node(CODEPTR(), FLOW_SPLIT_NODE, &this->my_graph, + static_cast *>(this), this->output_ports()); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + __TBB_NOINLINE_SYM split_node(const node_set& nodes) : split_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + __TBB_NOINLINE_SYM split_node(const split_node& other) + : graph_node(other.my_graph), base_type(other), + my_output_ports(init_output_ports::call(other.my_graph, my_output_ports)) + { + fgt_multioutput_node(CODEPTR(), FLOW_SPLIT_NODE, &this->my_graph, + static_cast *>(this), this->output_ports()); + } + + output_ports_type &output_ports() { return my_output_ports; } + +protected: + graph_task *try_put_task(const TupleType& t) override { + // Sending split messages in parallel is not justified, as overheads would prevail. + // Also, we do not have successors here. So we just tell the task returned here is successful. + return emit_element::emit_this(this->my_graph, t, output_ports()); + } + void reset_node(reset_flags f) override { + if (f & rf_clear_edges) + clear_element::clear_this(my_output_ports); + + __TBB_ASSERT(!(f & rf_clear_edges) || clear_element::this_empty(my_output_ports), "split_node reset failed"); + } + graph& graph_reference() const override { + return my_graph; + } + +private: + output_ports_type my_output_ports; +}; + +//! Implements an executable node that supports continue_msg -> Output +template > + __TBB_requires(std::copy_constructible) +class continue_node : public graph_node, public continue_input, + public function_output { +public: + typedef continue_msg input_type; + typedef Output output_type; + typedef continue_input input_impl_type; + typedef function_output fOutput_type; + typedef typename input_impl_type::predecessor_type predecessor_type; + typedef typename fOutput_type::successor_type successor_type; + + //! 
Constructor for executable node with continue_msg -> Output + template + __TBB_requires(continue_node_body) + __TBB_NOINLINE_SYM continue_node( + graph &g, + Body body, Policy = Policy(), node_priority_t a_priority = no_priority + ) : graph_node(g), input_impl_type( g, body, a_priority ), + fOutput_type(g) { + fgt_node_with_body( CODEPTR(), FLOW_CONTINUE_NODE, &this->my_graph, + + static_cast *>(this), + static_cast *>(this), this->my_body ); + } + + template + __TBB_requires(continue_node_body) + continue_node( graph& g, Body body, node_priority_t a_priority ) + : continue_node(g, body, Policy(), a_priority) {} + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + __TBB_requires(continue_node_body) + continue_node( const node_set& nodes, Body body, + Policy p = Policy(), node_priority_t a_priority = no_priority ) + : continue_node(nodes.graph_reference(), body, p, a_priority ) { + make_edges_in_order(nodes, *this); + } + template + __TBB_requires(continue_node_body) + continue_node( const node_set& nodes, Body body, node_priority_t a_priority) + : continue_node(nodes, body, Policy(), a_priority) {} +#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + + //! Constructor for executable node with continue_msg -> Output + template + __TBB_requires(continue_node_body) + __TBB_NOINLINE_SYM continue_node( + graph &g, int number_of_predecessors, + Body body, Policy = Policy(), node_priority_t a_priority = no_priority + ) : graph_node(g) + , input_impl_type(g, number_of_predecessors, body, a_priority), + fOutput_type(g) { + fgt_node_with_body( CODEPTR(), FLOW_CONTINUE_NODE, &this->my_graph, + static_cast *>(this), + static_cast *>(this), this->my_body ); + } + + template + __TBB_requires(continue_node_body) + continue_node( graph& g, int number_of_predecessors, Body body, node_priority_t a_priority) + : continue_node(g, number_of_predecessors, body, Policy(), a_priority) {} + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + __TBB_requires(continue_node_body) + continue_node( const node_set& nodes, int number_of_predecessors, + Body body, Policy p = Policy(), node_priority_t a_priority = no_priority ) + : continue_node(nodes.graph_reference(), number_of_predecessors, body, p, a_priority) { + make_edges_in_order(nodes, *this); + } + + template + __TBB_requires(continue_node_body) + continue_node( const node_set& nodes, int number_of_predecessors, + Body body, node_priority_t a_priority ) + : continue_node(nodes, number_of_predecessors, body, Policy(), a_priority) {} +#endif + + //! Copy constructor + __TBB_NOINLINE_SYM continue_node( const continue_node& src ) : + graph_node(src.my_graph), input_impl_type(src), + function_output(src.my_graph) { + fgt_node_with_body( CODEPTR(), FLOW_CONTINUE_NODE, &this->my_graph, + static_cast *>(this), + static_cast *>(this), this->my_body ); + } + +protected: + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + using input_impl_type::try_put_task; + broadcast_cache &successors () override { return fOutput_type::my_successors; } + + void reset_node(reset_flags f) override { + input_impl_type::reset_receiver(f); + if(f & rf_clear_edges)successors().clear(); + __TBB_ASSERT(!(f & rf_clear_edges) || successors().empty(), "continue_node not reset"); + } +}; // continue_node + +//! 
Forwards messages of type T to all successors +template +class broadcast_node : public graph_node, public receiver, public sender { +public: + typedef T input_type; + typedef T output_type; + typedef typename receiver::predecessor_type predecessor_type; + typedef typename sender::successor_type successor_type; +private: + broadcast_cache my_successors; +public: + + __TBB_NOINLINE_SYM explicit broadcast_node(graph& g) : graph_node(g), my_successors(this) { + fgt_node( CODEPTR(), FLOW_BROADCAST_NODE, &this->my_graph, + static_cast *>(this), static_cast *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + broadcast_node(const node_set& nodes) : broadcast_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + // Copy constructor + __TBB_NOINLINE_SYM broadcast_node( const broadcast_node& src ) : broadcast_node(src.my_graph) {} + + //! Adds a successor + bool register_successor( successor_type &r ) override { + my_successors.register_successor( r ); + return true; + } + + //! Removes s as a successor + bool remove_successor( successor_type &r ) override { + my_successors.remove_successor( r ); + return true; + } + +protected: + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + //! build a task to run the successor if possible. Default is old behavior. + graph_task *try_put_task(const T& t) override { + graph_task *new_task = my_successors.try_put_task(t); + if (!new_task) new_task = SUCCESSFULLY_ENQUEUED; + return new_task; + } + + graph& graph_reference() const override { + return my_graph; + } + + void reset_node(reset_flags f) override { + if (f&rf_clear_edges) { + my_successors.clear(); + } + __TBB_ASSERT(!(f & rf_clear_edges) || my_successors.empty(), "Error resetting broadcast_node"); + } +}; // broadcast_node + +//! 
Forwards messages in arbitrary order +template +class buffer_node + : public graph_node + , public reservable_item_buffer< T, cache_aligned_allocator > + , public receiver, public sender +{ + typedef cache_aligned_allocator internals_allocator; + +public: + typedef T input_type; + typedef T output_type; + typedef typename receiver::predecessor_type predecessor_type; + typedef typename sender::successor_type successor_type; + typedef buffer_node class_type; + +protected: + typedef size_t size_type; + round_robin_cache< T, null_rw_mutex > my_successors; + + friend class forward_task_bypass< class_type >; + + enum op_type {reg_succ, rem_succ, req_item, res_item, rel_res, con_res, put_item, try_fwd_task + }; + + // implements the aggregator_operation concept + class buffer_operation : public aggregated_operation< buffer_operation > { + public: + char type; + T* elem; + graph_task* ltask; + successor_type *r; + + buffer_operation(const T& e, op_type t) : type(char(t)) + , elem(const_cast(&e)) , ltask(nullptr) + , r(nullptr) + {} + buffer_operation(op_type t) : type(char(t)), elem(nullptr), ltask(nullptr), r(nullptr) {} + }; + + bool forwarder_busy; + typedef aggregating_functor handler_type; + friend class aggregating_functor; + aggregator< handler_type, buffer_operation> my_aggregator; + + virtual void handle_operations(buffer_operation *op_list) { + handle_operations_impl(op_list, this); + } + + template + void handle_operations_impl(buffer_operation *op_list, derived_type* derived) { + __TBB_ASSERT(static_cast(derived) == this, "'this' is not a base class for derived"); + + buffer_operation *tmp = nullptr; + bool try_forwarding = false; + while (op_list) { + tmp = op_list; + op_list = op_list->next; + switch (tmp->type) { + case reg_succ: internal_reg_succ(tmp); try_forwarding = true; break; + case rem_succ: internal_rem_succ(tmp); break; + case req_item: internal_pop(tmp); break; + case res_item: internal_reserve(tmp); break; + case rel_res: internal_release(tmp); try_forwarding = true; break; + case con_res: internal_consume(tmp); try_forwarding = true; break; + case put_item: try_forwarding = internal_push(tmp); break; + case try_fwd_task: internal_forward_task(tmp); break; + } + } + + derived->order(); + + if (try_forwarding && !forwarder_busy) { + if(is_graph_active(this->my_graph)) { + forwarder_busy = true; + typedef forward_task_bypass task_type; + small_object_allocator allocator{}; + graph_task* new_task = allocator.new_object(graph_reference(), allocator, *this); + my_graph.reserve_wait(); + // tmp should point to the last item handled by the aggregator. This is the operation + // the handling thread enqueued. So modifying that record will be okay. + // TODO revamp: check that the issue is still present + // workaround for icc bug (at least 12.0 and 13.0) + // error: function "tbb::flow::interfaceX::combine_tasks" cannot be called with the given argument list + // argument types are: (graph, graph_task *, graph_task *) + graph_task *z = tmp->ltask; + graph &g = this->my_graph; + tmp->ltask = combine_tasks(g, z, new_task); // in case the op generated a task + } + } + } // handle_operations + + inline graph_task *grab_forwarding_task( buffer_operation &op_data) { + return op_data.ltask; + } + + inline bool enqueue_forwarding_task(buffer_operation &op_data) { + graph_task *ft = grab_forwarding_task(op_data); + if(ft) { + spawn_in_graph_arena(graph_reference(), *ft); + return true; + } + return false; + } + + //! 
This is executed by an enqueued task, the "forwarder" + virtual graph_task *forward_task() { + buffer_operation op_data(try_fwd_task); + graph_task *last_task = nullptr; + do { + op_data.status = WAIT; + op_data.ltask = nullptr; + my_aggregator.execute(&op_data); + + // workaround for icc bug + graph_task *xtask = op_data.ltask; + graph& g = this->my_graph; + last_task = combine_tasks(g, last_task, xtask); + } while (op_data.status ==SUCCEEDED); + return last_task; + } + + //! Register successor + virtual void internal_reg_succ(buffer_operation *op) { + __TBB_ASSERT(op->r, nullptr); + my_successors.register_successor(*(op->r)); + op->status.store(SUCCEEDED, std::memory_order_release); + } + + //! Remove successor + virtual void internal_rem_succ(buffer_operation *op) { + __TBB_ASSERT(op->r, nullptr); + my_successors.remove_successor(*(op->r)); + op->status.store(SUCCEEDED, std::memory_order_release); + } + +private: + void order() {} + + bool is_item_valid() { + return this->my_item_valid(this->my_tail - 1); + } + + void try_put_and_add_task(graph_task*& last_task) { + graph_task *new_task = my_successors.try_put_task(this->back()); + if (new_task) { + // workaround for icc bug + graph& g = this->my_graph; + last_task = combine_tasks(g, last_task, new_task); + this->destroy_back(); + } + } + +protected: + //! Tries to forward valid items to successors + virtual void internal_forward_task(buffer_operation *op) { + internal_forward_task_impl(op, this); + } + + template + void internal_forward_task_impl(buffer_operation *op, derived_type* derived) { + __TBB_ASSERT(static_cast(derived) == this, "'this' is not a base class for derived"); + + if (this->my_reserved || !derived->is_item_valid()) { + op->status.store(FAILED, std::memory_order_release); + this->forwarder_busy = false; + return; + } + // Try forwarding, giving each successor a chance + graph_task* last_task = nullptr; + size_type counter = my_successors.size(); + for (; counter > 0 && derived->is_item_valid(); --counter) + derived->try_put_and_add_task(last_task); + + op->ltask = last_task; // return task + if (last_task && !counter) { + op->status.store(SUCCEEDED, std::memory_order_release); + } + else { + op->status.store(FAILED, std::memory_order_release); + forwarder_busy = false; + } + } + + virtual bool internal_push(buffer_operation *op) { + __TBB_ASSERT(op->elem, nullptr); + this->push_back(*(op->elem)); + op->status.store(SUCCEEDED, std::memory_order_release); + return true; + } + + virtual void internal_pop(buffer_operation *op) { + __TBB_ASSERT(op->elem, nullptr); + if(this->pop_back(*(op->elem))) { + op->status.store(SUCCEEDED, std::memory_order_release); + } + else { + op->status.store(FAILED, std::memory_order_release); + } + } + + virtual void internal_reserve(buffer_operation *op) { + __TBB_ASSERT(op->elem, nullptr); + if(this->reserve_front(*(op->elem))) { + op->status.store(SUCCEEDED, std::memory_order_release); + } + else { + op->status.store(FAILED, std::memory_order_release); + } + } + + virtual void internal_consume(buffer_operation *op) { + this->consume_front(); + op->status.store(SUCCEEDED, std::memory_order_release); + } + + virtual void internal_release(buffer_operation *op) { + this->release_front(); + op->status.store(SUCCEEDED, std::memory_order_release); + } + +public: + //! 
Constructor + __TBB_NOINLINE_SYM explicit buffer_node( graph &g ) + : graph_node(g), reservable_item_buffer(), receiver(), + sender(), my_successors(this), forwarder_busy(false) + { + my_aggregator.initialize_handler(handler_type(this)); + fgt_node( CODEPTR(), FLOW_BUFFER_NODE, &this->my_graph, + static_cast *>(this), static_cast *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + buffer_node(const node_set& nodes) : buffer_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + //! Copy constructor + __TBB_NOINLINE_SYM buffer_node( const buffer_node& src ) : buffer_node(src.my_graph) {} + + // + // message sender implementation + // + + //! Adds a new successor. + /** Adds successor r to the list of successors; may forward tasks. */ + bool register_successor( successor_type &r ) override { + buffer_operation op_data(reg_succ); + op_data.r = &r; + my_aggregator.execute(&op_data); + (void)enqueue_forwarding_task(op_data); + return true; + } + + //! Removes a successor. + /** Removes successor r from the list of successors. + It also calls r.remove_predecessor(*this) to remove this node as a predecessor. */ + bool remove_successor( successor_type &r ) override { + // TODO revamp: investigate why full qualification is necessary here + tbb::detail::d1::remove_predecessor(r, *this); + buffer_operation op_data(rem_succ); + op_data.r = &r; + my_aggregator.execute(&op_data); + // even though this operation does not cause a forward, if we are the handler, and + // a forward is scheduled, we may be the first to reach this point after the aggregator, + // and so should check for the task. + (void)enqueue_forwarding_task(op_data); + return true; + } + + //! Request an item from the buffer_node + /** true = v contains the returned item
+ false = no item has been returned */ + bool try_get( T &v ) override { + buffer_operation op_data(req_item); + op_data.elem = &v; + my_aggregator.execute(&op_data); + (void)enqueue_forwarding_task(op_data); + return (op_data.status==SUCCEEDED); + } + + //! Reserves an item. + /** false = no item can be reserved
+ true = an item is reserved */ + bool try_reserve( T &v ) override { + buffer_operation op_data(res_item); + op_data.elem = &v; + my_aggregator.execute(&op_data); + (void)enqueue_forwarding_task(op_data); + return (op_data.status==SUCCEEDED); + } + + //! Release a reserved item. + /** true = item has been released and so remains in sender */ + bool try_release() override { + buffer_operation op_data(rel_res); + my_aggregator.execute(&op_data); + (void)enqueue_forwarding_task(op_data); + return true; + } + + //! Consumes a reserved item. + /** true = item is removed from sender and reservation removed */ + bool try_consume() override { + buffer_operation op_data(con_res); + my_aggregator.execute(&op_data); + (void)enqueue_forwarding_task(op_data); + return true; + } + +protected: + + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + //! receive an item, return a task *if possible + graph_task *try_put_task(const T &t) override { + buffer_operation op_data(t, put_item); + my_aggregator.execute(&op_data); + graph_task *ft = grab_forwarding_task(op_data); + // sequencer_nodes can return failure (if an item has been previously inserted) + // We have to spawn the returned task if our own operation fails. + + if(ft && op_data.status ==FAILED) { + // we haven't succeeded queueing the item, but for some reason the + // call returned a task (if another request resulted in a successful + // forward this could happen.) Queue the task and reset the pointer. + spawn_in_graph_arena(graph_reference(), *ft); ft = nullptr; + } + else if(!ft && op_data.status ==SUCCEEDED) { + ft = SUCCESSFULLY_ENQUEUED; + } + return ft; + } + + graph& graph_reference() const override { + return my_graph; + } + +protected: + void reset_node( reset_flags f) override { + reservable_item_buffer::reset(); + // TODO: just clear structures + if (f&rf_clear_edges) { + my_successors.clear(); + } + forwarder_busy = false; + } +}; // buffer_node + +//! 
Forwards messages in FIFO order +template +class queue_node : public buffer_node { +protected: + typedef buffer_node base_type; + typedef typename base_type::size_type size_type; + typedef typename base_type::buffer_operation queue_operation; + typedef queue_node class_type; + +private: + template friend class buffer_node; + + bool is_item_valid() { + return this->my_item_valid(this->my_head); + } + + void try_put_and_add_task(graph_task*& last_task) { + graph_task *new_task = this->my_successors.try_put_task(this->front()); + if (new_task) { + // workaround for icc bug + graph& graph_ref = this->graph_reference(); + last_task = combine_tasks(graph_ref, last_task, new_task); + this->destroy_front(); + } + } + +protected: + void internal_forward_task(queue_operation *op) override { + this->internal_forward_task_impl(op, this); + } + + void internal_pop(queue_operation *op) override { + if ( this->my_reserved || !this->my_item_valid(this->my_head)){ + op->status.store(FAILED, std::memory_order_release); + } + else { + this->pop_front(*(op->elem)); + op->status.store(SUCCEEDED, std::memory_order_release); + } + } + void internal_reserve(queue_operation *op) override { + if (this->my_reserved || !this->my_item_valid(this->my_head)) { + op->status.store(FAILED, std::memory_order_release); + } + else { + this->reserve_front(*(op->elem)); + op->status.store(SUCCEEDED, std::memory_order_release); + } + } + void internal_consume(queue_operation *op) override { + this->consume_front(); + op->status.store(SUCCEEDED, std::memory_order_release); + } + +public: + typedef T input_type; + typedef T output_type; + typedef typename receiver::predecessor_type predecessor_type; + typedef typename sender::successor_type successor_type; + + //! Constructor + __TBB_NOINLINE_SYM explicit queue_node( graph &g ) : base_type(g) { + fgt_node( CODEPTR(), FLOW_QUEUE_NODE, &(this->my_graph), + static_cast *>(this), + static_cast *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + queue_node( const node_set& nodes) : queue_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + //! Copy constructor + __TBB_NOINLINE_SYM queue_node( const queue_node& src) : base_type(src) { + fgt_node( CODEPTR(), FLOW_QUEUE_NODE, &(this->my_graph), + static_cast *>(this), + static_cast *>(this) ); + } + + +protected: + void reset_node( reset_flags f) override { + base_type::reset_node(f); + } +}; // queue_node + +//! Forwards messages in sequence order +template + __TBB_requires(std::copyable) +class sequencer_node : public queue_node { + function_body< T, size_t > *my_sequencer; + // my_sequencer should be a benign function and must be callable + // from a parallel context. Does this mean it needn't be reset? +public: + typedef T input_type; + typedef T output_type; + typedef typename receiver::predecessor_type predecessor_type; + typedef typename sender::successor_type successor_type; + + //! Constructor + template< typename Sequencer > + __TBB_requires(sequencer) + __TBB_NOINLINE_SYM sequencer_node( graph &g, const Sequencer& s ) : queue_node(g), + my_sequencer(new function_body_leaf< T, size_t, Sequencer>(s) ) { + fgt_node( CODEPTR(), FLOW_SEQUENCER_NODE, &(this->my_graph), + static_cast *>(this), + static_cast *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + __TBB_requires(sequencer) + sequencer_node( const node_set& nodes, const Sequencer& s) + : sequencer_node(nodes.graph_reference(), s) { + make_edges_in_order(nodes, *this); + } +#endif + + //! 
Copy constructor + __TBB_NOINLINE_SYM sequencer_node( const sequencer_node& src ) : queue_node(src), + my_sequencer( src.my_sequencer->clone() ) { + fgt_node( CODEPTR(), FLOW_SEQUENCER_NODE, &(this->my_graph), + static_cast *>(this), + static_cast *>(this) ); + } + + //! Destructor + ~sequencer_node() { delete my_sequencer; } + +protected: + typedef typename buffer_node::size_type size_type; + typedef typename buffer_node::buffer_operation sequencer_operation; + +private: + bool internal_push(sequencer_operation *op) override { + size_type tag = (*my_sequencer)(*(op->elem)); +#if !TBB_DEPRECATED_SEQUENCER_DUPLICATES + if (tag < this->my_head) { + // have already emitted a message with this tag + op->status.store(FAILED, std::memory_order_release); + return false; + } +#endif + // cannot modify this->my_tail now; the buffer would be inconsistent. + size_t new_tail = (tag+1 > this->my_tail) ? tag+1 : this->my_tail; + + if (this->size(new_tail) > this->capacity()) { + this->grow_my_array(this->size(new_tail)); + } + this->my_tail = new_tail; + + const op_stat res = this->place_item(tag, *(op->elem)) ? SUCCEEDED : FAILED; + op->status.store(res, std::memory_order_release); + return res ==SUCCEEDED; + } +}; // sequencer_node + +//! Forwards messages in priority order +template> +class priority_queue_node : public buffer_node { +public: + typedef T input_type; + typedef T output_type; + typedef buffer_node base_type; + typedef priority_queue_node class_type; + typedef typename receiver::predecessor_type predecessor_type; + typedef typename sender::successor_type successor_type; + + //! Constructor + __TBB_NOINLINE_SYM explicit priority_queue_node( graph &g, const Compare& comp = Compare() ) + : buffer_node(g), compare(comp), mark(0) { + fgt_node( CODEPTR(), FLOW_PRIORITY_QUEUE_NODE, &(this->my_graph), + static_cast *>(this), + static_cast *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + priority_queue_node(const node_set& nodes, const Compare& comp = Compare()) + : priority_queue_node(nodes.graph_reference(), comp) { + make_edges_in_order(nodes, *this); + } +#endif + + //! Copy constructor + __TBB_NOINLINE_SYM priority_queue_node( const priority_queue_node &src ) + : buffer_node(src), mark(0) + { + fgt_node( CODEPTR(), FLOW_PRIORITY_QUEUE_NODE, &(this->my_graph), + static_cast *>(this), + static_cast *>(this) ); + } + +protected: + + void reset_node( reset_flags f) override { + mark = 0; + base_type::reset_node(f); + } + + typedef typename buffer_node::size_type size_type; + typedef typename buffer_node::item_type item_type; + typedef typename buffer_node::buffer_operation prio_operation; + + //! 
Tries to forward valid items to successors + void internal_forward_task(prio_operation *op) override { + this->internal_forward_task_impl(op, this); + } + + void handle_operations(prio_operation *op_list) override { + this->handle_operations_impl(op_list, this); + } + + bool internal_push(prio_operation *op) override { + prio_push(*(op->elem)); + op->status.store(SUCCEEDED, std::memory_order_release); + return true; + } + + void internal_pop(prio_operation *op) override { + // if empty or already reserved, don't pop + if ( this->my_reserved == true || this->my_tail == 0 ) { + op->status.store(FAILED, std::memory_order_release); + return; + } + + *(op->elem) = prio(); + op->status.store(SUCCEEDED, std::memory_order_release); + prio_pop(); + + } + + // pops the highest-priority item, saves copy + void internal_reserve(prio_operation *op) override { + if (this->my_reserved == true || this->my_tail == 0) { + op->status.store(FAILED, std::memory_order_release); + return; + } + this->my_reserved = true; + *(op->elem) = prio(); + reserved_item = *(op->elem); + op->status.store(SUCCEEDED, std::memory_order_release); + prio_pop(); + } + + void internal_consume(prio_operation *op) override { + op->status.store(SUCCEEDED, std::memory_order_release); + this->my_reserved = false; + reserved_item = input_type(); + } + + void internal_release(prio_operation *op) override { + op->status.store(SUCCEEDED, std::memory_order_release); + prio_push(reserved_item); + this->my_reserved = false; + reserved_item = input_type(); + } + +private: + template friend class buffer_node; + + void order() { + if (mark < this->my_tail) heapify(); + __TBB_ASSERT(mark == this->my_tail, "mark unequal after heapify"); + } + + bool is_item_valid() { + return this->my_tail > 0; + } + + void try_put_and_add_task(graph_task*& last_task) { + graph_task * new_task = this->my_successors.try_put_task(this->prio()); + if (new_task) { + // workaround for icc bug + graph& graph_ref = this->graph_reference(); + last_task = combine_tasks(graph_ref, last_task, new_task); + prio_pop(); + } + } + +private: + Compare compare; + size_type mark; + + input_type reserved_item; + + // in case a reheap has not been done after a push, check if the mark item is higher than the 0'th item + bool prio_use_tail() { + __TBB_ASSERT(mark <= this->my_tail, "mark outside bounds before test"); + return mark < this->my_tail && compare(this->get_my_item(0), this->get_my_item(this->my_tail - 1)); + } + + // prio_push: checks that the item will fit, expand array if necessary, put at end + void prio_push(const T &src) { + if ( this->my_tail >= this->my_array_size ) + this->grow_my_array( this->my_tail + 1 ); + (void) this->place_item(this->my_tail, src); + ++(this->my_tail); + __TBB_ASSERT(mark < this->my_tail, "mark outside bounds after push"); + } + + // prio_pop: deletes highest priority item from the array, and if it is item + // 0, move last item to 0 and reheap. If end of array, just destroy and decrement tail + // and mark. Assumes the array has already been tested for emptiness; no failure. 
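+    // Note: `mark` tracks how much of the array is currently heapified; elements
+    // at indices [mark, my_tail) were pushed since the last heapify()/reheap().
+    // prio_use_tail() compares the newest such element against the heap root so
+    // that a pop can be served directly from the un-heapified tail when it wins.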
+ void prio_pop() { + if (prio_use_tail()) { + // there are newly pushed elements; last one higher than top + // copy the data + this->destroy_item(this->my_tail-1); + --(this->my_tail); + __TBB_ASSERT(mark <= this->my_tail, "mark outside bounds after pop"); + return; + } + this->destroy_item(0); + if(this->my_tail > 1) { + // push the last element down heap + __TBB_ASSERT(this->my_item_valid(this->my_tail - 1), nullptr); + this->move_item(0,this->my_tail - 1); + } + --(this->my_tail); + if(mark > this->my_tail) --mark; + if (this->my_tail > 1) // don't reheap for heap of size 1 + reheap(); + __TBB_ASSERT(mark <= this->my_tail, "mark outside bounds after pop"); + } + + const T& prio() { + return this->get_my_item(prio_use_tail() ? this->my_tail-1 : 0); + } + + // turn array into heap + void heapify() { + if(this->my_tail == 0) { + mark = 0; + return; + } + if (!mark) mark = 1; + for (; markmy_tail; ++mark) { // for each unheaped element + size_type cur_pos = mark; + input_type to_place; + this->fetch_item(mark,to_place); + do { // push to_place up the heap + size_type parent = (cur_pos-1)>>1; + if (!compare(this->get_my_item(parent), to_place)) + break; + this->move_item(cur_pos, parent); + cur_pos = parent; + } while( cur_pos ); + (void) this->place_item(cur_pos, to_place); + } + } + + // otherwise heapified array with new root element; rearrange to heap + void reheap() { + size_type cur_pos=0, child=1; + while (child < mark) { + size_type target = child; + if (child+1get_my_item(child), + this->get_my_item(child+1))) + ++target; + // target now has the higher priority child + if (compare(this->get_my_item(target), + this->get_my_item(cur_pos))) + break; + // swap + this->swap_items(cur_pos, target); + cur_pos = target; + child = (cur_pos<<1)+1; + } + } +}; // priority_queue_node + +//! Forwards messages only if the threshold has not been reached +/** This node forwards items until its threshold is reached. + It contains no buffering. If the downstream node rejects, the + message is dropped. */ +template< typename T, typename DecrementType=continue_msg > +class limiter_node : public graph_node, public receiver< T >, public sender< T > { +public: + typedef T input_type; + typedef T output_type; + typedef typename receiver::predecessor_type predecessor_type; + typedef typename sender::successor_type successor_type; + //TODO: There is a lack of predefined types for its controlling "decrementer" port. It should be fixed later. + +private: + size_t my_threshold; + size_t my_count; // number of successful puts + size_t my_tries; // number of active put attempts + size_t my_future_decrement; // number of active decrement + reservable_predecessor_cache< T, spin_mutex > my_predecessors; + spin_mutex my_mutex; + broadcast_cache< T > my_successors; + + //! 
The internal receiver< DecrementType > that adjusts the count + threshold_regulator< limiter_node, DecrementType > decrement; + + graph_task* decrement_counter( long long delta ) { + if ( delta > 0 && size_t(delta) > my_threshold ) { + delta = my_threshold; + } + + { + spin_mutex::scoped_lock lock(my_mutex); + if ( delta > 0 && size_t(delta) > my_count ) { + if( my_tries > 0 ) { + my_future_decrement += (size_t(delta) - my_count); + } + my_count = 0; + } + else if ( delta < 0 && size_t(-delta) > my_threshold - my_count ) { + my_count = my_threshold; + } + else { + my_count -= size_t(delta); // absolute value of delta is sufficiently small + } + __TBB_ASSERT(my_count <= my_threshold, "counter values are truncated to be inside the [0, threshold] interval"); + } + return forward_task(); + } + + // Let threshold_regulator call decrement_counter() + friend class threshold_regulator< limiter_node, DecrementType >; + + friend class forward_task_bypass< limiter_node >; + + bool check_conditions() { // always called under lock + return ( my_count + my_tries < my_threshold && !my_predecessors.empty() && !my_successors.empty() ); + } + + // only returns a valid task pointer or nullptr, never SUCCESSFULLY_ENQUEUED + graph_task* forward_task() { + input_type v; + graph_task* rval = nullptr; + bool reserved = false; + + { + spin_mutex::scoped_lock lock(my_mutex); + if ( check_conditions() ) + ++my_tries; + else + return nullptr; + } + + //SUCCESS + // if we can reserve and can put, we consume the reservation + // we increment the count and decrement the tries + if ( (my_predecessors.try_reserve(v)) == true ) { + reserved = true; + if ( (rval = my_successors.try_put_task(v)) != nullptr ) { + { + spin_mutex::scoped_lock lock(my_mutex); + ++my_count; + if ( my_future_decrement ) { + if ( my_count > my_future_decrement ) { + my_count -= my_future_decrement; + my_future_decrement = 0; + } + else { + my_future_decrement -= my_count; + my_count = 0; + } + } + --my_tries; + my_predecessors.try_consume(); + if ( check_conditions() ) { + if ( is_graph_active(this->my_graph) ) { + typedef forward_task_bypass> task_type; + small_object_allocator allocator{}; + graph_task* rtask = allocator.new_object( my_graph, allocator, *this ); + my_graph.reserve_wait(); + spawn_in_graph_arena(graph_reference(), *rtask); + } + } + } + return rval; + } + } + //FAILURE + //if we can't reserve, we decrement the tries + //if we can reserve but can't put, we decrement the tries and release the reservation + { + spin_mutex::scoped_lock lock(my_mutex); + --my_tries; + if (reserved) my_predecessors.try_release(); + if ( check_conditions() ) { + if ( is_graph_active(this->my_graph) ) { + small_object_allocator allocator{}; + typedef forward_task_bypass> task_type; + graph_task* t = allocator.new_object(my_graph, allocator, *this); + my_graph.reserve_wait(); + __TBB_ASSERT(!rval, "Have two tasks to handle"); + return t; + } + } + return rval; + } + } + + void initialize() { + fgt_node( + CODEPTR(), FLOW_LIMITER_NODE, &this->my_graph, + static_cast *>(this), static_cast *>(&decrement), + static_cast *>(this) + ); + } + +public: + //! 
Constructor + limiter_node(graph &g, size_t threshold) + : graph_node(g), my_threshold(threshold), my_count(0), my_tries(0), my_future_decrement(0), + my_predecessors(this), my_successors(this), decrement(this) + { + initialize(); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + limiter_node(const node_set& nodes, size_t threshold) + : limiter_node(nodes.graph_reference(), threshold) { + make_edges_in_order(nodes, *this); + } +#endif + + //! Copy constructor + limiter_node( const limiter_node& src ) : limiter_node(src.my_graph, src.my_threshold) {} + + //! The interface for accessing internal receiver< DecrementType > that adjusts the count + receiver& decrementer() { return decrement; } + + //! Replace the current successor with this new successor + bool register_successor( successor_type &r ) override { + spin_mutex::scoped_lock lock(my_mutex); + bool was_empty = my_successors.empty(); + my_successors.register_successor(r); + //spawn a forward task if this is the only successor + if ( was_empty && !my_predecessors.empty() && my_count + my_tries < my_threshold ) { + if ( is_graph_active(this->my_graph) ) { + small_object_allocator allocator{}; + typedef forward_task_bypass> task_type; + graph_task* t = allocator.new_object(my_graph, allocator, *this); + my_graph.reserve_wait(); + spawn_in_graph_arena(graph_reference(), *t); + } + } + return true; + } + + //! Removes a successor from this node + /** r.remove_predecessor(*this) is also called. */ + bool remove_successor( successor_type &r ) override { + // TODO revamp: investigate why qualification is needed for remove_predecessor() call + tbb::detail::d1::remove_predecessor(r, *this); + my_successors.remove_successor(r); + return true; + } + + //! Adds src to the list of cached predecessors. + bool register_predecessor( predecessor_type &src ) override { + spin_mutex::scoped_lock lock(my_mutex); + my_predecessors.add( src ); + if ( my_count + my_tries < my_threshold && !my_successors.empty() && is_graph_active(this->my_graph) ) { + small_object_allocator allocator{}; + typedef forward_task_bypass> task_type; + graph_task* t = allocator.new_object(my_graph, allocator, *this); + my_graph.reserve_wait(); + spawn_in_graph_arena(graph_reference(), *t); + } + return true; + } + + //! Removes src from the list of cached predecessors. + bool remove_predecessor( predecessor_type &src ) override { + my_predecessors.remove( src ); + return true; + } + +protected: + + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + //! Puts an item to this receiver + graph_task* try_put_task( const T &t ) override { + { + spin_mutex::scoped_lock lock(my_mutex); + if ( my_count + my_tries >= my_threshold ) + return nullptr; + else + ++my_tries; + } + + graph_task* rtask = my_successors.try_put_task(t); + if ( !rtask ) { // try_put_task failed. 
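A usage sketch may clarify the threshold/decrementer interplay described above (illustration only; the include path follows this tree's layout). At most `threshold` items are in flight between the source and the sink, and each continue_msg delivered to the decrementer frees one slot:

// Illustrative sketch: throttle a fast source with limiter_node.
#include "third_party/tbb/flow_graph.h"

int main() {
    using namespace tbb::flow;
    graph g;
    int i = 0;
    input_node<int> src(g, [&](tbb::flow_control& fc) -> int {
        if (i >= 100) { fc.stop(); return 0; }
        return i++;
    });
    limiter_node<int> limiter(g, /*threshold=*/4);
    function_node<int, continue_msg> sink(g, unlimited,
        [](int) { /* do work */ return continue_msg(); });
    make_edge(src, limiter);
    make_edge(limiter, sink);
    make_edge(sink, limiter.decrementer());  // each finished item frees a slot
    src.activate();
    g.wait_for_all();
    return 0;
}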
+ spin_mutex::scoped_lock lock(my_mutex); + --my_tries; + if (check_conditions() && is_graph_active(this->my_graph)) { + small_object_allocator allocator{}; + typedef forward_task_bypass> task_type; + rtask = allocator.new_object(my_graph, allocator, *this); + my_graph.reserve_wait(); + } + } + else { + spin_mutex::scoped_lock lock(my_mutex); + ++my_count; + if ( my_future_decrement ) { + if ( my_count > my_future_decrement ) { + my_count -= my_future_decrement; + my_future_decrement = 0; + } + else { + my_future_decrement -= my_count; + my_count = 0; + } + } + --my_tries; + } + return rtask; + } + + graph& graph_reference() const override { return my_graph; } + + void reset_node( reset_flags f ) override { + my_count = 0; + if ( f & rf_clear_edges ) { + my_predecessors.clear(); + my_successors.clear(); + } + else { + my_predecessors.reset(); + } + decrement.reset_receiver(f); + } +}; // limiter_node + +#include "third_party/tbb/detail/_flow_graph_join_impl.h" + +template class join_node; + +template +class join_node: public unfolded_join_node::value, reserving_port, OutputTuple, reserving> { +private: + static const int N = std::tuple_size::value; + typedef unfolded_join_node unfolded_type; +public: + typedef OutputTuple output_type; + typedef typename unfolded_type::input_ports_type input_ports_type; + __TBB_NOINLINE_SYM explicit join_node(graph &g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_RESERVING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + __TBB_NOINLINE_SYM join_node(const node_set& nodes, reserving = reserving()) : join_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + __TBB_NOINLINE_SYM join_node(const join_node &other) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_RESERVING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; + +template +class join_node: public unfolded_join_node::value, queueing_port, OutputTuple, queueing> { +private: + static const int N = std::tuple_size::value; + typedef unfolded_join_node unfolded_type; +public: + typedef OutputTuple output_type; + typedef typename unfolded_type::input_ports_type input_ports_type; + __TBB_NOINLINE_SYM explicit join_node(graph &g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_QUEUEING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + __TBB_NOINLINE_SYM join_node(const node_set& nodes, queueing = queueing()) : join_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + __TBB_NOINLINE_SYM join_node(const join_node &other) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_QUEUEING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; + +#if __TBB_CPP20_CONCEPTS_PRESENT +// Helper function which is well-formed only if all of the elements in OutputTuple +// satisfies join_node_function_object +template +void join_node_function_objects_helper( std::index_sequence ) + requires (std::tuple_size_v == sizeof...(Functions)) && + (... 
&& join_node_function_object, K>); + +template +concept join_node_functions = requires { + join_node_function_objects_helper(std::make_index_sequence{}); +}; + +#endif + +// template for key_matching join_node +// tag_matching join_node is a specialization of key_matching, and is source-compatible. +template +class join_node > : public unfolded_join_node::value, + key_matching_port, OutputTuple, key_matching > { +private: + static const int N = std::tuple_size::value; + typedef unfolded_join_node > unfolded_type; +public: + typedef OutputTuple output_type; + typedef typename unfolded_type::input_ports_type input_ports_type; + +#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + join_node(graph &g) : unfolded_type(g) {} +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + + template + __TBB_requires(join_node_functions) + __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1) : unfolded_type(g, b0, b1) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + template + __TBB_requires(join_node_functions) + __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2) : unfolded_type(g, b0, b1, b2) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + template + __TBB_requires(join_node_functions) + __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3) : unfolded_type(g, b0, b1, b2, b3) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + template + __TBB_requires(join_node_functions) + __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4) : + unfolded_type(g, b0, b1, b2, b3, b4) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } +#if __TBB_VARIADIC_MAX >= 6 + template + __TBB_requires(join_node_functions) + __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4, __TBB_B5 b5) : + unfolded_type(g, b0, b1, b2, b3, b4, b5) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } +#endif +#if __TBB_VARIADIC_MAX >= 7 + template + __TBB_requires(join_node_functions) + __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4, __TBB_B5 b5, __TBB_B6 b6) : + unfolded_type(g, b0, b1, b2, b3, b4, b5, b6) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } +#endif +#if __TBB_VARIADIC_MAX >= 8 + template + __TBB_requires(join_node_functions) + __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4, __TBB_B5 b5, __TBB_B6 b6, + __TBB_B7 b7) : unfolded_type(g, b0, b1, b2, b3, b4, b5, b6, b7) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } +#endif +#if __TBB_VARIADIC_MAX >= 9 + template + __TBB_requires(join_node_functions) + __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4, __TBB_B5 b5, 
__TBB_B6 b6, + __TBB_B7 b7, __TBB_B8 b8) : unfolded_type(g, b0, b1, b2, b3, b4, b5, b6, b7, b8) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } +#endif +#if __TBB_VARIADIC_MAX >= 10 + template + __TBB_requires(join_node_functions) + __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4, __TBB_B5 b5, __TBB_B6 b6, + __TBB_B7 b7, __TBB_B8 b8, __TBB_B9 b9) : unfolded_type(g, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } +#endif + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template < +#if (__clang_major__ == 3 && __clang_minor__ == 4) + // clang 3.4 misdeduces 'Args...' for 'node_set' while it can cope with template template parameter. + template class node_set, +#endif + typename... Args, typename... Bodies + > + __TBB_requires((sizeof...(Bodies) == 0) || join_node_functions) + __TBB_NOINLINE_SYM join_node(const node_set& nodes, Bodies... bodies) + : join_node(nodes.graph_reference(), bodies...) { + make_edges_in_order(nodes, *this); + } +#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + + __TBB_NOINLINE_SYM join_node(const join_node &other) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; + +// indexer node +#include "third_party/tbb/detail/_flow_graph_indexer_impl.h" + +// TODO: Implement interface with variadic template or tuple +template class indexer_node; + +//indexer node specializations +template +class indexer_node : public unfolded_indexer_node > { +private: + static const int N = 1; +public: + typedef std::tuple InputTuple; + typedef tagged_msg output_type; + typedef unfolded_indexer_node unfolded_type; + __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + // Copy constructor + __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } +}; + +template +class indexer_node : public unfolded_indexer_node > { +private: + static const int N = 2; +public: + typedef std::tuple InputTuple; + typedef tagged_msg output_type; + typedef unfolded_indexer_node unfolded_type; + __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + // Copy constructor + __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; + +template +class indexer_node : public 
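A usage sketch for the key_matching specialization above (illustration only; the order/payment record types are invented for the example): each input port is given a function that extracts the key from its message type, and messages with equal keys are combined into one output tuple.

// Illustrative sketch: pair records from two streams by a shared integer key.
#include "third_party/tbb/flow_graph.h"
#include <iostream>
#include <string>
#include <tuple>

struct order   { int id; std::string item; };
struct payment { int id; double amount; };

int main() {
    using namespace tbb::flow;
    graph g;
    join_node<std::tuple<order, payment>, key_matching<int>> join(
        g,
        [](const order& o)   { return o.id; },
        [](const payment& p) { return p.id; });
    function_node<std::tuple<order, payment>, continue_msg> sink(
        g, unlimited,
        [](const std::tuple<order, payment>& t) {
            std::cout << std::get<0>(t).item << " paid\n";
            return continue_msg();
        });
    make_edge(join, sink);
    input_port<0>(join).try_put(order{7, "book"});
    input_port<1>(join).try_put(payment{7, 12.5});
    g.wait_for_all();
    return 0;
}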
unfolded_indexer_node > { +private: + static const int N = 3; +public: + typedef std::tuple InputTuple; + typedef tagged_msg output_type; + typedef unfolded_indexer_node unfolded_type; + __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + // Copy constructor + __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; + +template +class indexer_node : public unfolded_indexer_node > { +private: + static const int N = 4; +public: + typedef std::tuple InputTuple; + typedef tagged_msg output_type; + typedef unfolded_indexer_node unfolded_type; + __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + // Copy constructor + __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; + +template +class indexer_node : public unfolded_indexer_node > { +private: + static const int N = 5; +public: + typedef std::tuple InputTuple; + typedef tagged_msg output_type; + typedef unfolded_indexer_node unfolded_type; + __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + // Copy constructor + __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; + +#if __TBB_VARIADIC_MAX >= 6 +template +class indexer_node : public unfolded_indexer_node > { +private: + static const int N = 6; +public: + typedef std::tuple InputTuple; + typedef tagged_msg output_type; + typedef unfolded_indexer_node unfolded_type; + __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + // Copy constructor + __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; +#endif //variadic max 6 + +#if __TBB_VARIADIC_MAX >= 
7 +template +class indexer_node : public unfolded_indexer_node > { +private: + static const int N = 7; +public: + typedef std::tuple InputTuple; + typedef tagged_msg output_type; + typedef unfolded_indexer_node unfolded_type; + __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + // Copy constructor + __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; +#endif //variadic max 7 + +#if __TBB_VARIADIC_MAX >= 8 +template +class indexer_node : public unfolded_indexer_node > { +private: + static const int N = 8; +public: + typedef std::tuple InputTuple; + typedef tagged_msg output_type; + typedef unfolded_indexer_node unfolded_type; + indexer_node(graph& g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + // Copy constructor + indexer_node( const indexer_node& other ) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; +#endif //variadic max 8 + +#if __TBB_VARIADIC_MAX >= 9 +template +class indexer_node : public unfolded_indexer_node > { +private: + static const int N = 9; +public: + typedef std::tuple InputTuple; + typedef tagged_msg output_type; + typedef unfolded_indexer_node unfolded_type; + __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + // Copy constructor + __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; +#endif //variadic max 9 + +#if __TBB_VARIADIC_MAX >= 10 +template +class indexer_node/*default*/ : public unfolded_indexer_node > { +private: + static const int N = 10; +public: + typedef std::tuple InputTuple; + typedef tagged_msg output_type; + typedef unfolded_indexer_node unfolded_type; + __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + // Copy constructor + __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, 
&this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; +#endif //variadic max 10 + +template< typename T > +inline void internal_make_edge( sender &p, receiver &s ) { + register_successor(p, s); + fgt_make_edge( &p, &s ); +} + +//! Makes an edge between a single predecessor and a single successor +template< typename T > +inline void make_edge( sender &p, receiver &s ) { + internal_make_edge( p, s ); +} + +//Makes an edge from port 0 of a multi-output predecessor to port 0 of a multi-input successor. +template< typename T, typename V, + typename = typename T::output_ports_type, typename = typename V::input_ports_type > +inline void make_edge( T& output, V& input) { + make_edge(std::get<0>(output.output_ports()), std::get<0>(input.input_ports())); +} + +//Makes an edge from port 0 of a multi-output predecessor to a receiver. +template< typename T, typename R, + typename = typename T::output_ports_type > +inline void make_edge( T& output, receiver& input) { + make_edge(std::get<0>(output.output_ports()), input); +} + +//Makes an edge from a sender to port 0 of a multi-input successor. +template< typename S, typename V, + typename = typename V::input_ports_type > +inline void make_edge( sender& output, V& input) { + make_edge(output, std::get<0>(input.input_ports())); +} + +template< typename T > +inline void internal_remove_edge( sender &p, receiver &s ) { + remove_successor( p, s ); + fgt_remove_edge( &p, &s ); +} + +//! Removes an edge between a single predecessor and a single successor +template< typename T > +inline void remove_edge( sender &p, receiver &s ) { + internal_remove_edge( p, s ); +} + +//Removes an edge between port 0 of a multi-output predecessor and port 0 of a multi-input successor. +template< typename T, typename V, + typename = typename T::output_ports_type, typename = typename V::input_ports_type > +inline void remove_edge( T& output, V& input) { + remove_edge(std::get<0>(output.output_ports()), std::get<0>(input.input_ports())); +} + +//Removes an edge between port 0 of a multi-output predecessor and a receiver. +template< typename T, typename R, + typename = typename T::output_ports_type > +inline void remove_edge( T& output, receiver& input) { + remove_edge(std::get<0>(output.output_ports()), input); +} +//Removes an edge between a sender and port 0 of a multi-input successor. +template< typename S, typename V, + typename = typename V::input_ports_type > +inline void remove_edge( sender& output, V& input) { + remove_edge(output, std::get<0>(input.input_ports())); +} + +//! Returns a copy of the body from a function or continue node +template< typename Body, typename Node > +Body copy_body( Node &n ) { + return n.template copy_function_object(); +} + +//composite_node +template< typename InputTuple, typename OutputTuple > class composite_node; + +template< typename... InputTypes, typename... OutputTypes> +class composite_node , std::tuple > : public graph_node { + +public: + typedef std::tuple< receiver&... > input_ports_type; + typedef std::tuple< sender&... 
> output_ports_type; + +private: + std::unique_ptr my_input_ports; + std::unique_ptr my_output_ports; + + static const size_t NUM_INPUTS = sizeof...(InputTypes); + static const size_t NUM_OUTPUTS = sizeof...(OutputTypes); + +protected: + void reset_node(reset_flags) override {} + +public: + composite_node( graph &g ) : graph_node(g) { + fgt_multiinput_multioutput_node( CODEPTR(), FLOW_COMPOSITE_NODE, this, &this->my_graph ); + } + + template + void set_external_ports(T1&& input_ports_tuple, T2&& output_ports_tuple) { + static_assert(NUM_INPUTS == std::tuple_size::value, "number of arguments does not match number of input ports"); + static_assert(NUM_OUTPUTS == std::tuple_size::value, "number of arguments does not match number of output ports"); + + fgt_internal_input_alias_helper::alias_port( this, input_ports_tuple); + fgt_internal_output_alias_helper::alias_port( this, output_ports_tuple); + + my_input_ports.reset( new input_ports_type(std::forward(input_ports_tuple)) ); + my_output_ports.reset( new output_ports_type(std::forward(output_ports_tuple)) ); + } + + template< typename... NodeTypes > + void add_visible_nodes(const NodeTypes&... n) { add_nodes_impl(this, true, n...); } + + template< typename... NodeTypes > + void add_nodes(const NodeTypes&... n) { add_nodes_impl(this, false, n...); } + + + input_ports_type& input_ports() { + __TBB_ASSERT(my_input_ports, "input ports not set, call set_external_ports to set input ports"); + return *my_input_ports; + } + + output_ports_type& output_ports() { + __TBB_ASSERT(my_output_ports, "output ports not set, call set_external_ports to set output ports"); + return *my_output_ports; + } +}; // class composite_node + +//composite_node with only input ports +template< typename... InputTypes> +class composite_node , std::tuple<> > : public graph_node { +public: + typedef std::tuple< receiver&... > input_ports_type; + +private: + std::unique_ptr my_input_ports; + static const size_t NUM_INPUTS = sizeof...(InputTypes); + +protected: + void reset_node(reset_flags) override {} + +public: + composite_node( graph &g ) : graph_node(g) { + fgt_composite( CODEPTR(), this, &g ); + } + + template + void set_external_ports(T&& input_ports_tuple) { + static_assert(NUM_INPUTS == std::tuple_size::value, "number of arguments does not match number of input ports"); + + fgt_internal_input_alias_helper::alias_port( this, input_ports_tuple); + + my_input_ports.reset( new input_ports_type(std::forward(input_ports_tuple)) ); + } + + template< typename... NodeTypes > + void add_visible_nodes(const NodeTypes&... n) { add_nodes_impl(this, true, n...); } + + template< typename... NodeTypes > + void add_nodes( const NodeTypes&... n) { add_nodes_impl(this, false, n...); } + + + input_ports_type& input_ports() { + __TBB_ASSERT(my_input_ports, "input ports not set, call set_external_ports to set input ports"); + return *my_input_ports; + } + +}; // class composite_node + +//composite_nodes with only output_ports +template +class composite_node , std::tuple > : public graph_node { +public: + typedef std::tuple< sender&... 
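A usage sketch for composite_node (illustration only; the `doubler` class is invented for the example): internal nodes are wired as usual, then exposed through set_external_ports so the composite behaves like any other node with input_port<N> and output_port<N>.

// Illustrative sketch: wrap a function_node and a queue_node in one composite.
#include "third_party/tbb/flow_graph.h"
#include <iostream>
#include <tuple>

using namespace tbb::flow;

class doubler : public composite_node<std::tuple<int>, std::tuple<int>> {
    typedef composite_node<std::tuple<int>, std::tuple<int>> base_type;
    function_node<int, int> times_two;
    queue_node<int> results;
public:
    doubler(graph& g)
        : base_type(g),
          times_two(g, unlimited, [](int v) { return 2 * v; }),
          results(g) {
        make_edge(times_two, results);
        base_type::set_external_ports(
            base_type::input_ports_type(times_two),
            base_type::output_ports_type(results));
    }
};

int main() {
    graph g;
    doubler d(g);
    function_node<int, continue_msg> printer(g, serial,
        [](int v) { std::cout << "doubled: " << v << "\n"; return continue_msg(); });
    make_edge(output_port<0>(d), printer);
    input_port<0>(d).try_put(21);  // prints "doubled: 42"
    g.wait_for_all();
    return 0;
}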
> output_ports_type; + +private: + std::unique_ptr my_output_ports; + static const size_t NUM_OUTPUTS = sizeof...(OutputTypes); + +protected: + void reset_node(reset_flags) override {} + +public: + __TBB_NOINLINE_SYM composite_node( graph &g ) : graph_node(g) { + fgt_composite( CODEPTR(), this, &g ); + } + + template + void set_external_ports(T&& output_ports_tuple) { + static_assert(NUM_OUTPUTS == std::tuple_size::value, "number of arguments does not match number of output ports"); + + fgt_internal_output_alias_helper::alias_port( this, output_ports_tuple); + + my_output_ports.reset( new output_ports_type(std::forward(output_ports_tuple)) ); + } + + template + void add_visible_nodes(const NodeTypes&... n) { add_nodes_impl(this, true, n...); } + + template + void add_nodes(const NodeTypes&... n) { add_nodes_impl(this, false, n...); } + + + output_ports_type& output_ports() { + __TBB_ASSERT(my_output_ports, "output ports not set, call set_external_ports to set output ports"); + return *my_output_ports; + } + +}; // class composite_node + +template +class async_body_base: no_assign { +public: + typedef Gateway gateway_type; + + async_body_base(gateway_type *gateway): my_gateway(gateway) { } + void set_gateway(gateway_type *gateway) { + my_gateway = gateway; + } + +protected: + gateway_type *my_gateway; +}; + +template +class async_body: public async_body_base { +private: + Body my_body; + +public: + typedef async_body_base base_type; + typedef Gateway gateway_type; + + async_body(const Body &body, gateway_type *gateway) + : base_type(gateway), my_body(body) { } + + void operator()( const Input &v, Ports & ) noexcept(noexcept(tbb::detail::invoke(my_body, v, std::declval()))) { + tbb::detail::invoke(my_body, v, *this->my_gateway); + } + + Body get_body() { return my_body; } +}; + +//! Implements async node +template < typename Input, typename Output, + typename Policy = queueing_lightweight > + __TBB_requires(std::default_initializable && std::copy_constructible) +class async_node + : public multifunction_node< Input, std::tuple< Output >, Policy >, public sender< Output > +{ + typedef multifunction_node< Input, std::tuple< Output >, Policy > base_type; + typedef multifunction_input< + Input, typename base_type::output_ports_type, Policy, cache_aligned_allocator> mfn_input_type; + +public: + typedef Input input_type; + typedef Output output_type; + typedef receiver receiver_type; + typedef receiver successor_type; + typedef sender predecessor_type; + typedef receiver_gateway gateway_type; + typedef async_body_base async_body_base_type; + typedef typename base_type::output_ports_type output_ports_type; + +private: + class receiver_gateway_impl: public receiver_gateway { + public: + receiver_gateway_impl(async_node* node): my_node(node) {} + void reserve_wait() override { + fgt_async_reserve(static_cast(my_node), &my_node->my_graph); + my_node->my_graph.reserve_wait(); + } + + void release_wait() override { + async_node* n = my_node; + graph* g = &n->my_graph; + g->release_wait(); + fgt_async_commit(static_cast(n), g); + } + + //! Implements gateway_type::try_put for an external activity to submit a message to FG + bool try_put(const Output &i) override { + return my_node->try_put_impl(i); + } + + private: + async_node* my_node; + } my_gateway; + + //The substitute of 'this' for member construction, to prevent compiler warnings + async_node* self() { return this; } + + //! 
Implements gateway_type::try_put for an external activity to submit a message to FG + bool try_put_impl(const Output &i) { + multifunction_output &port_0 = output_port<0>(*this); + broadcast_cache& port_successors = port_0.successors(); + fgt_async_try_put_begin(this, &port_0); + // TODO revamp: change to std::list + graph_task_list tasks; + bool is_at_least_one_put_successful = port_successors.gather_successful_try_puts(i, tasks); + __TBB_ASSERT( is_at_least_one_put_successful || tasks.empty(), + "Return status is inconsistent with the method operation." ); + + while( !tasks.empty() ) { + enqueue_in_graph_arena(this->my_graph, tasks.pop_front()); + } + fgt_async_try_put_end(this, &port_0); + return is_at_least_one_put_successful; + } + +public: + template + __TBB_requires(async_node_body) + __TBB_NOINLINE_SYM async_node( + graph &g, size_t concurrency, + Body body, Policy = Policy(), node_priority_t a_priority = no_priority + ) : base_type( + g, concurrency, + async_body + (body, &my_gateway), a_priority ), my_gateway(self()) { + fgt_multioutput_node_with_body<1>( + CODEPTR(), FLOW_ASYNC_NODE, + &this->my_graph, static_cast *>(this), + this->output_ports(), this->my_body + ); + } + + template + __TBB_requires(async_node_body) + __TBB_NOINLINE_SYM async_node(graph& g, size_t concurrency, Body body, node_priority_t a_priority) + : async_node(g, concurrency, body, Policy(), a_priority) {} + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + __TBB_requires(async_node_body) + __TBB_NOINLINE_SYM async_node( + const node_set& nodes, size_t concurrency, Body body, + Policy = Policy(), node_priority_t a_priority = no_priority ) + : async_node(nodes.graph_reference(), concurrency, body, a_priority) { + make_edges_in_order(nodes, *this); + } + + template + __TBB_requires(async_node_body) + __TBB_NOINLINE_SYM async_node(const node_set& nodes, size_t concurrency, Body body, node_priority_t a_priority) + : async_node(nodes, concurrency, body, Policy(), a_priority) {} +#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + + __TBB_NOINLINE_SYM async_node( const async_node &other ) : base_type(other), sender(), my_gateway(self()) { + static_cast(this->my_body->get_body_ptr())->set_gateway(&my_gateway); + static_cast(this->my_init_body->get_body_ptr())->set_gateway(&my_gateway); + + fgt_multioutput_node_with_body<1>( CODEPTR(), FLOW_ASYNC_NODE, + &this->my_graph, static_cast *>(this), + this->output_ports(), this->my_body ); + } + + gateway_type& gateway() { + return my_gateway; + } + + // Define sender< Output > + + //! Add a new successor to this node + bool register_successor(successor_type&) override { + __TBB_ASSERT(false, "Successors must be registered only via ports"); + return false; + } + + //! 
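A usage sketch for async_node and its gateway (illustration only; the detached squaring thread stands in for any external activity): reserve_wait keeps wait_for_all from returning while results are still pending outside the graph, and try_put on the gateway re-injects them.

// Illustrative sketch: hand work to a user-managed thread, return the result
// to the graph through the gateway.
#include "third_party/tbb/flow_graph.h"
#include <thread>

int main() {
    using namespace tbb::flow;
    graph g;
    typedef async_node<int, int> async_t;
    async_t async(g, unlimited,
        [](const int& input, async_t::gateway_type& gw) {
            gw.reserve_wait();                 // keep the graph alive
            std::thread([input, &gw] {
                gw.try_put(input * input);     // submit the result back
                gw.release_wait();
            }).detach();
        });
    function_node<int, continue_msg> sink(g, serial,
        [](int) { return continue_msg(); });
    make_edge(output_port<0>(async), sink);
    async.try_put(7);
    g.wait_for_all();                          // returns after release_wait()
    return 0;
}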
Removes a successor from this node + bool remove_successor(successor_type&) override { + __TBB_ASSERT(false, "Successors must be removed only via ports"); + return false; + } + + template + Body copy_function_object() { + typedef multifunction_body mfn_body_type; + typedef async_body async_body_type; + mfn_body_type &body_ref = *this->my_body; + async_body_type ab = *static_cast(dynamic_cast< multifunction_body_leaf & >(body_ref).get_body_ptr()); + return ab.get_body(); + } + +protected: + + void reset_node( reset_flags f) override { + base_type::reset_node(f); + } +}; + +#include "third_party/tbb/detail/_flow_graph_node_set_impl.h" + +template< typename T > +class overwrite_node : public graph_node, public receiver, public sender { +public: + typedef T input_type; + typedef T output_type; + typedef typename receiver::predecessor_type predecessor_type; + typedef typename sender::successor_type successor_type; + + __TBB_NOINLINE_SYM explicit overwrite_node(graph &g) + : graph_node(g), my_successors(this), my_buffer_is_valid(false) + { + fgt_node( CODEPTR(), FLOW_OVERWRITE_NODE, &this->my_graph, + static_cast *>(this), static_cast *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + overwrite_node(const node_set& nodes) : overwrite_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + //! Copy constructor; doesn't take anything from src; default won't work + __TBB_NOINLINE_SYM overwrite_node( const overwrite_node& src ) : overwrite_node(src.my_graph) {} + + ~overwrite_node() {} + + bool register_successor( successor_type &s ) override { + spin_mutex::scoped_lock l( my_mutex ); + if (my_buffer_is_valid && is_graph_active( my_graph )) { + // We have a valid value that must be forwarded immediately. + bool ret = s.try_put( my_buffer ); + if ( ret ) { + // We add the successor that accepted our put + my_successors.register_successor( s ); + } else { + // In case of reservation a race between the moment of reservation and register_successor can appear, + // because failed reserve does not mean that register_successor is not ready to put a message immediately. + // We have some sort of infinite loop: reserving node tries to set pull state for the edge, + // but overwrite_node tries to return push state back. That is why we have to break this loop with task creation. + small_object_allocator allocator{}; + typedef register_predecessor_task task_type; + graph_task* t = allocator.new_object(graph_reference(), allocator, *this, s); + graph_reference().reserve_wait(); + spawn_in_graph_arena( my_graph, *t ); + } + } else { + // No valid value yet, just add as successor + my_successors.register_successor( s ); + } + return true; + } + + bool remove_successor( successor_type &s ) override { + spin_mutex::scoped_lock l( my_mutex ); + my_successors.remove_successor(s); + return true; + } + + bool try_get( input_type &v ) override { + spin_mutex::scoped_lock l( my_mutex ); + if ( my_buffer_is_valid ) { + v = my_buffer; + return true; + } + return false; + } + + //! Reserves an item + bool try_reserve( T &v ) override { + return try_get(v); + } + + //! Releases the reserved item + bool try_release() override { return true; } + + //! 
Consumes the reserved item + bool try_consume() override { return true; } + + bool is_valid() { + spin_mutex::scoped_lock l( my_mutex ); + return my_buffer_is_valid; + } + + void clear() { + spin_mutex::scoped_lock l( my_mutex ); + my_buffer_is_valid = false; + } + +protected: + + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + graph_task* try_put_task( const input_type &v ) override { + spin_mutex::scoped_lock l( my_mutex ); + return try_put_task_impl(v); + } + + graph_task * try_put_task_impl(const input_type &v) { + my_buffer = v; + my_buffer_is_valid = true; + graph_task* rtask = my_successors.try_put_task(v); + if (!rtask) rtask = SUCCESSFULLY_ENQUEUED; + return rtask; + } + + graph& graph_reference() const override { + return my_graph; + } + + //! Breaks an infinite loop between the node reservation and register_successor call + struct register_predecessor_task : public graph_task { + register_predecessor_task( + graph& g, small_object_allocator& allocator, predecessor_type& owner, successor_type& succ) + : graph_task(g, allocator), o(owner), s(succ) {}; + + task* execute(execution_data& ed) override { + // TODO revamp: investigate why qualification is needed for register_successor() call + using tbb::detail::d1::register_predecessor; + using tbb::detail::d1::register_successor; + if ( !register_predecessor(s, o) ) { + register_successor(o, s); + } + finalize(ed); + return nullptr; + } + + task* cancel(execution_data& ed) override { + finalize(ed); + return nullptr; + } + + predecessor_type& o; + successor_type& s; + }; + + spin_mutex my_mutex; + broadcast_cache< input_type, null_rw_mutex > my_successors; + input_type my_buffer; + bool my_buffer_is_valid; + + void reset_node( reset_flags f) override { + my_buffer_is_valid = false; + if (f&rf_clear_edges) { + my_successors.clear(); + } + } +}; // overwrite_node + +template< typename T > +class write_once_node : public overwrite_node { +public: + typedef T input_type; + typedef T output_type; + typedef overwrite_node base_type; + typedef typename receiver::predecessor_type predecessor_type; + typedef typename sender::successor_type successor_type; + + //! Constructor + __TBB_NOINLINE_SYM explicit write_once_node(graph& g) : base_type(g) { + fgt_node( CODEPTR(), FLOW_WRITE_ONCE_NODE, &(this->my_graph), + static_cast *>(this), + static_cast *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + write_once_node(const node_set& nodes) : write_once_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + //! Copy constructor: call base class copy constructor + __TBB_NOINLINE_SYM write_once_node( const write_once_node& src ) : base_type(src) { + fgt_node( CODEPTR(), FLOW_WRITE_ONCE_NODE, &(this->my_graph), + static_cast *>(this), + static_cast *>(this) ); + } + +protected: + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + graph_task *try_put_task( const T &v ) override { + spin_mutex::scoped_lock l( this->my_mutex ); + return this->my_buffer_is_valid ? 
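A usage sketch contrasting overwrite_node with write_once_node (illustration only): the former keeps the most recent value and can be cleared, the latter keeps the first value it accepts and rejects later puts until cleared.

// Illustrative sketch of single-value buffering semantics.
#include "third_party/tbb/flow_graph.h"
#include <cassert>

int main() {
    using namespace tbb::flow;
    graph g;
    overwrite_node<int> latest(g);
    write_once_node<int> first(g);

    latest.try_put(1);
    latest.try_put(2);   // overwrites the buffered value
    first.try_put(1);
    first.try_put(2);    // rejected: a value is already buffered
    g.wait_for_all();

    int v = 0;
    assert(latest.try_get(v) && v == 2);  // try_get copies, does not consume
    assert(first.try_get(v) && v == 1);
    assert(latest.is_valid());
    latest.clear();                       // invalidate the buffer
    assert(!latest.is_valid());
    return 0;
}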
nullptr : this->try_put_task_impl(v); + } +}; // write_once_node + +inline void set_name(const graph& g, const char *name) { + fgt_graph_desc(&g, name); +} + +template +inline void set_name(const input_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const function_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const continue_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const broadcast_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const buffer_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const queue_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const sequencer_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const priority_queue_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const limiter_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const join_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const indexer_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const overwrite_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const write_once_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const multifunction_node& node, const char *name) { + fgt_multioutput_node_desc(&node, name); +} + +template +inline void set_name(const split_node& node, const char *name) { + fgt_multioutput_node_desc(&node, name); +} + +template< typename InputTuple, typename OutputTuple > +inline void set_name(const composite_node& node, const char *name) { + fgt_multiinput_multioutput_node_desc(&node, name); +} + +template +inline void set_name(const async_node& node, const char *name) +{ + fgt_multioutput_node_desc(&node, name); +} +} // d1 +} // detail +} // tbb + + +// Include deduction guides for node classes +#include "third_party/tbb/detail/_flow_graph_nodes_deduction.h" + +namespace tbb { +namespace flow { +inline namespace v1 { + using detail::d1::receiver; + using detail::d1::sender; + + using detail::d1::serial; + using detail::d1::unlimited; + + using detail::d1::reset_flags; + using detail::d1::rf_reset_protocol; + using detail::d1::rf_reset_bodies; + using detail::d1::rf_clear_edges; + + using detail::d1::graph; + using detail::d1::graph_node; + using detail::d1::continue_msg; + + using detail::d1::input_node; + using detail::d1::function_node; + using detail::d1::multifunction_node; + using detail::d1::split_node; + using detail::d1::output_port; + using detail::d1::indexer_node; + using detail::d1::tagged_msg; + using detail::d1::cast_to; + using detail::d1::is_a; + using detail::d1::continue_node; + using detail::d1::overwrite_node; + using detail::d1::write_once_node; + using detail::d1::broadcast_node; + using detail::d1::buffer_node; + using detail::d1::queue_node; + using detail::d1::sequencer_node; + using detail::d1::priority_queue_node; + using detail::d1::limiter_node; + using namespace detail::d1::graph_policy_namespace; + using detail::d1::join_node; + using detail::d1::input_port; + using detail::d1::copy_body; + using 
detail::d1::make_edge; + using detail::d1::remove_edge; + using detail::d1::tag_value; + using detail::d1::composite_node; + using detail::d1::async_node; + using detail::d1::node_priority_t; + using detail::d1::no_priority; + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + using detail::d1::follows; + using detail::d1::precedes; + using detail::d1::make_node_set; + using detail::d1::make_edges; +#endif + +} // v1 +} // flow + + using detail::d1::flow_control; + +namespace profiling { + using detail::d1::set_name; +} // profiling + +} // tbb + + +#if TBB_USE_PROFILING_TOOLS && ( __unix__ || __APPLE__ ) + // We don't do pragma pop here, since it still gives warning on the USER side + #undef __TBB_NOINLINE_SYM +#endif + +#endif // __TBB_flow_graph_H diff --git a/third_party/tbb/flow_graph_abstractions.h b/third_party/tbb/flow_graph_abstractions.h new file mode 100644 index 000000000..87921c9f9 --- /dev/null +++ b/third_party/tbb/flow_graph_abstractions.h @@ -0,0 +1,52 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_flow_graph_abstractions_H +#define __TBB_flow_graph_abstractions_H + +namespace tbb { +namespace detail { +namespace d1 { + +//! Pure virtual template classes that define interfaces for async communication +class graph_proxy { +public: + //! Inform a graph that messages may come from outside, to prevent premature graph completion + virtual void reserve_wait() = 0; + + //! Inform a graph that a previous call to reserve_wait is no longer in effect + virtual void release_wait() = 0; + + virtual ~graph_proxy() {} +}; + +template +class receiver_gateway : public graph_proxy { +public: + //! Type of inputing data into FG. + typedef Input input_type; + + //! Submit signal from an asynchronous activity to FG. + virtual bool try_put(const input_type&) = 0; +}; + +} // d1 + + +} // detail +} // tbb +#endif diff --git a/third_party/tbb/global_control.cpp b/third_party/tbb/global_control.cpp new file mode 100644 index 000000000..8d84e61ca --- /dev/null +++ b/third_party/tbb/global_control.cpp @@ -0,0 +1,281 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_template_helpers.h" + +#include "third_party/tbb/global_control.h" +#include "third_party/tbb/tbb_allocator.h" +#include "third_party/tbb/spin_mutex.h" + +#include "third_party/tbb/governor.h" +#include "third_party/tbb/threading_control.h" +#include "third_party/tbb/market.h" +#include "third_party/tbb/misc.h" + +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/set" + +namespace tbb { +namespace detail { +namespace r1 { + +//! Comparator for a set of global_control objects +struct control_storage_comparator { + bool operator()(const d1::global_control* lhs, const d1::global_control* rhs) const; +}; + +class control_storage { + friend struct global_control_impl; + friend std::size_t global_control_active_value(int); + friend void global_control_lock(); + friend void global_control_unlock(); + friend std::size_t global_control_active_value_unsafe(d1::global_control::parameter); +protected: + std::size_t my_active_value{0}; + std::set> my_list{}; + spin_mutex my_list_mutex{}; +public: + virtual std::size_t default_value() const = 0; + virtual void apply_active(std::size_t new_active) { + my_active_value = new_active; + } + virtual bool is_first_arg_preferred(std::size_t a, std::size_t b) const { + return a>b; // prefer max by default + } + virtual std::size_t active_value() { + spin_mutex::scoped_lock lock(my_list_mutex); // protect my_list.empty() call + return !my_list.empty() ? my_active_value : default_value(); + } + + std::size_t active_value_unsafe() { + return !my_list.empty() ? my_active_value : default_value(); + } +}; + +class alignas(max_nfs_size) allowed_parallelism_control : public control_storage { + std::size_t default_value() const override { + return max(1U, governor::default_num_threads()); + } + bool is_first_arg_preferred(std::size_t a, std::size_t b) const override { + return a= 1, nullptr); + // -1 to take external thread into account + threading_control::set_active_num_workers(my_active_value - 1); + } + std::size_t active_value() override { + spin_mutex::scoped_lock lock(my_list_mutex); // protect my_list.empty() call + if (my_list.empty()) { + return default_value(); + } + + // non-zero, if market is active + const std::size_t workers = threading_control::max_num_workers(); + // We can't exceed market's maximal number of workers. + // +1 to take external thread into account + return workers ? min(workers + 1, my_active_value) : my_active_value; + } +public: + std::size_t active_value_if_present() const { + return !my_list.empty() ? 
my_active_value : 0; + } +}; + +class alignas(max_nfs_size) stack_size_control : public control_storage { + std::size_t default_value() const override { +#if _WIN32_WINNT >= 0x0602 /* _WIN32_WINNT_WIN8 */ + static auto ThreadStackSizeDefault = [] { + ULONG_PTR hi, lo; + GetCurrentThreadStackLimits(&lo, &hi); + return hi - lo; + }(); + return ThreadStackSizeDefault; +#else + return ThreadStackSize; +#endif + } + void apply_active(std::size_t new_active) override { + control_storage::apply_active(new_active); +#if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00) + __TBB_ASSERT( false, "For Windows 8 Store* apps we must not set stack size" ); +#endif + } +}; + +class alignas(max_nfs_size) terminate_on_exception_control : public control_storage { + std::size_t default_value() const override { + return 0; + } +}; + +class alignas(max_nfs_size) lifetime_control : public control_storage { + bool is_first_arg_preferred(std::size_t, std::size_t) const override { + return false; // not interested + } + std::size_t default_value() const override { + return 0; + } + void apply_active(std::size_t new_active) override { + if (new_active == 1) { + // reserve the market reference + threading_control::register_lifetime_control(); + } else if (new_active == 0) { // new_active == 0 + threading_control::unregister_lifetime_control(/*blocking_terminate*/ false); + } + control_storage::apply_active(new_active); + } + +public: + bool is_empty() { + spin_mutex::scoped_lock lock(my_list_mutex); + return my_list.empty(); + } +}; + +static allowed_parallelism_control allowed_parallelism_ctl; +static stack_size_control stack_size_ctl; +static terminate_on_exception_control terminate_on_exception_ctl; +static lifetime_control lifetime_ctl; +static control_storage *controls[] = {&allowed_parallelism_ctl, &stack_size_ctl, &terminate_on_exception_ctl, &lifetime_ctl}; + +void global_control_lock() { + for (auto& ctl : controls) { + ctl->my_list_mutex.lock(); + } +} + +void global_control_unlock() { + int N = std::distance(std::begin(controls), std::end(controls)); + for (int i = N - 1; i >= 0; --i) { + controls[i]->my_list_mutex.unlock(); + } +} + +std::size_t global_control_active_value_unsafe(d1::global_control::parameter param) { + __TBB_ASSERT_RELEASE(param < d1::global_control::parameter_max, nullptr); + return controls[param]->active_value_unsafe(); +} + +//! 
Comparator for a set of global_control objects +inline bool control_storage_comparator::operator()(const d1::global_control* lhs, const d1::global_control* rhs) const { + __TBB_ASSERT_RELEASE(lhs->my_param < d1::global_control::parameter_max , nullptr); + return lhs->my_value < rhs->my_value || (lhs->my_value == rhs->my_value && lhs < rhs); +} + +bool terminate_on_exception() { + return d1::global_control::active_value(d1::global_control::terminate_on_exception) == 1; +} + +struct global_control_impl { +private: + static bool erase_if_present(control_storage* const c, d1::global_control& gc) { + auto it = c->my_list.find(&gc); + if (it != c->my_list.end()) { + c->my_list.erase(it); + return true; + } + return false; + } + +public: + + static void create(d1::global_control& gc) { + __TBB_ASSERT_RELEASE(gc.my_param < d1::global_control::parameter_max, nullptr); + control_storage* const c = controls[gc.my_param]; + + spin_mutex::scoped_lock lock(c->my_list_mutex); + if (c->my_list.empty() || c->is_first_arg_preferred(gc.my_value, c->my_active_value)) { + // to guarantee that apply_active() is called with current active value, + // calls it here and in internal_destroy() under my_list_mutex + c->apply_active(gc.my_value); + } + c->my_list.insert(&gc); + } + + static void destroy(d1::global_control& gc) { + __TBB_ASSERT_RELEASE(gc.my_param < d1::global_control::parameter_max, nullptr); + control_storage* const c = controls[gc.my_param]; + // Concurrent reading and changing global parameter is possible. + spin_mutex::scoped_lock lock(c->my_list_mutex); + __TBB_ASSERT(gc.my_param == d1::global_control::scheduler_handle || !c->my_list.empty(), nullptr); + std::size_t new_active = (std::size_t)(-1), old_active = c->my_active_value; + + if (!erase_if_present(c, gc)) { + __TBB_ASSERT(gc.my_param == d1::global_control::scheduler_handle , nullptr); + return; + } + if (c->my_list.empty()) { + __TBB_ASSERT(new_active == (std::size_t) - 1, nullptr); + new_active = c->default_value(); + } else { + new_active = (*c->my_list.begin())->my_value; + } + if (new_active != old_active) { + c->apply_active(new_active); + } + } + + static bool remove_and_check_if_empty(d1::global_control& gc) { + __TBB_ASSERT_RELEASE(gc.my_param < d1::global_control::parameter_max, nullptr); + control_storage* const c = controls[gc.my_param]; + + spin_mutex::scoped_lock lock(c->my_list_mutex); + __TBB_ASSERT(!c->my_list.empty(), nullptr); + erase_if_present(c, gc); + return c->my_list.empty(); + } +#if TBB_USE_ASSERT + static bool is_present(d1::global_control& gc) { + __TBB_ASSERT_RELEASE(gc.my_param < d1::global_control::parameter_max, nullptr); + control_storage* const c = controls[gc.my_param]; + + spin_mutex::scoped_lock lock(c->my_list_mutex); + auto it = c->my_list.find(&gc); + if (it != c->my_list.end()) { + return true; + } + return false; + } +#endif // TBB_USE_ASSERT +}; + +void __TBB_EXPORTED_FUNC create(d1::global_control& gc) { + global_control_impl::create(gc); +} +void __TBB_EXPORTED_FUNC destroy(d1::global_control& gc) { + global_control_impl::destroy(gc); +} + +bool remove_and_check_if_empty(d1::global_control& gc) { + return global_control_impl::remove_and_check_if_empty(gc); +} +#if TBB_USE_ASSERT +bool is_present(d1::global_control& gc) { + return global_control_impl::is_present(gc); +} +#endif // TBB_USE_ASSERT +std::size_t __TBB_EXPORTED_FUNC global_control_active_value(int param) { + __TBB_ASSERT_RELEASE(param < d1::global_control::parameter_max, nullptr); + return controls[param]->active_value(); +} + +} // 
namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/global_control.h b/third_party/tbb/global_control.h new file mode 100644 index 000000000..9740b5700 --- /dev/null +++ b/third_party/tbb/global_control.h @@ -0,0 +1,201 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_global_control_H +#define __TBB_global_control_H + +#include "third_party/tbb/detail/_config.h" + +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_attach.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_template_helpers.h" + +#include "third_party/libcxx/cstddef" +#include "third_party/libcxx/new" // std::nothrow_t + +namespace tbb { +namespace detail { + +namespace d1 { +class global_control; +class task_scheduler_handle; +} + +namespace r1 { +TBB_EXPORT void __TBB_EXPORTED_FUNC create(d1::global_control&); +TBB_EXPORT void __TBB_EXPORTED_FUNC destroy(d1::global_control&); +TBB_EXPORT std::size_t __TBB_EXPORTED_FUNC global_control_active_value(int); +struct global_control_impl; +struct control_storage_comparator; +void release_impl(d1::task_scheduler_handle& handle); +bool finalize_impl(d1::task_scheduler_handle& handle); +TBB_EXPORT void __TBB_EXPORTED_FUNC get(d1::task_scheduler_handle&); +TBB_EXPORT bool __TBB_EXPORTED_FUNC finalize(d1::task_scheduler_handle&, std::intptr_t mode); +} + +namespace d1 { + +class global_control { +public: + enum parameter { + max_allowed_parallelism, + thread_stack_size, + terminate_on_exception, + scheduler_handle, // not a public parameter + parameter_max // insert new parameters above this point + }; + + global_control(parameter p, std::size_t value) : + my_value(value), my_reserved(), my_param(p) { + suppress_unused_warning(my_reserved); + __TBB_ASSERT(my_param < parameter_max, "Invalid parameter"); +#if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00) + // For Windows 8 Store* apps it's impossible to set stack size + if (p==thread_stack_size) + return; +#elif __TBB_x86_64 && (_WIN32 || _WIN64) + if (p==thread_stack_size) + __TBB_ASSERT_RELEASE((unsigned)value == value, "Stack size is limited to unsigned int range"); +#endif + if (my_param==max_allowed_parallelism) + __TBB_ASSERT_RELEASE(my_value>0, "max_allowed_parallelism cannot be 0."); + r1::create(*this); + } + + ~global_control() { + __TBB_ASSERT(my_param < parameter_max, "Invalid parameter"); +#if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00) + // For Windows 8 Store* apps it's impossible to set stack size + if (my_param==thread_stack_size) + return; +#endif + r1::destroy(*this); + } + + static std::size_t active_value(parameter p) { + __TBB_ASSERT(p < parameter_max, "Invalid parameter"); + return r1::global_control_active_value((int)p); + } + +private: + std::size_t my_value; + std::intptr_t my_reserved; // TODO: substitution of global_control* not to break backward compatibility + parameter 
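+    // [Editorial note, not part of the upstream sources] A minimal usage sketch of the
+    // public interface declared above, assuming the tbb:: aliases that this header
+    // re-exports (global_control, active_value):
+    //
+    //     #include "third_party/tbb/global_control.h"
+    //
+    //     int main() {
+    //         // Cap TBB worker parallelism at 4 for the lifetime of `limit`.
+    //         tbb::global_control limit(tbb::global_control::max_allowed_parallelism, 4);
+    //         std::size_t active = tbb::global_control::active_value(
+    //             tbb::global_control::max_allowed_parallelism);   // 4 while `limit` is alive
+    //         (void)active;
+    //     }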
my_param; + + friend struct r1::global_control_impl; + friend struct r1::control_storage_comparator; +}; + +//! Finalization options. +//! Outside of the class to avoid extensive friendship. +static constexpr std::intptr_t release_nothrowing = 0; +static constexpr std::intptr_t finalize_nothrowing = 1; +static constexpr std::intptr_t finalize_throwing = 2; + +//! User side wrapper for a task scheduler lifetime control object +class task_scheduler_handle { +public: + //! Creates an empty task_scheduler_handle + task_scheduler_handle() = default; + + //! Creates an attached instance of task_scheduler_handle + task_scheduler_handle(attach) { + r1::get(*this); + } + + //! Release a reference if any + ~task_scheduler_handle() { + release(); + } + + //! No copy + task_scheduler_handle(const task_scheduler_handle& other) = delete; + task_scheduler_handle& operator=(const task_scheduler_handle& other) = delete; + + //! Move only + task_scheduler_handle(task_scheduler_handle&& other) noexcept { + std::swap(m_ctl, other.m_ctl); + } + task_scheduler_handle& operator=(task_scheduler_handle&& other) noexcept { + std::swap(m_ctl, other.m_ctl); + return *this; + }; + + //! Checks if the task_scheduler_handle is empty + explicit operator bool() const noexcept { + return m_ctl != nullptr; + } + + //! Release the reference and deactivate handle + void release() { + if (m_ctl != nullptr) { + r1::finalize(*this, release_nothrowing); + m_ctl = nullptr; + } + } + +private: + friend void r1::release_impl(task_scheduler_handle& handle); + friend bool r1::finalize_impl(task_scheduler_handle& handle); + friend void __TBB_EXPORTED_FUNC r1::get(task_scheduler_handle&); + + friend void finalize(task_scheduler_handle&); + friend bool finalize(task_scheduler_handle&, const std::nothrow_t&) noexcept; + + global_control* m_ctl{nullptr}; +}; + +#if TBB_USE_EXCEPTIONS +//! Waits for worker threads termination. Throws exception on error. +inline void finalize(task_scheduler_handle& handle) { + try_call([&] { + if (handle.m_ctl != nullptr) { + bool finalized = r1::finalize(handle, finalize_throwing); + __TBB_ASSERT_EX(finalized, "r1::finalize did not respect finalize_throwing ?"); + + } + }).on_completion([&] { + __TBB_ASSERT(!handle, "The handle should be empty after finalize"); + }); +} +#endif +//! Waits for worker threads termination. Returns false on error. +inline bool finalize(task_scheduler_handle& handle, const std::nothrow_t&) noexcept { + bool finalized = true; + if (handle.m_ctl != nullptr) { + finalized = r1::finalize(handle, finalize_nothrowing); + } + __TBB_ASSERT(!handle, "The handle should be empty after finalize"); + return finalized; +} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::global_control; +using detail::d1::attach; +using detail::d1::finalize; +using detail::d1::task_scheduler_handle; +using detail::r1::unsafe_wait; +} // namespace v1 + +} // namespace tbb + +#endif // __TBB_global_control_H diff --git a/third_party/tbb/governor.cpp b/third_party/tbb/governor.cpp new file mode 100644 index 000000000..91f3db3a1 --- /dev/null +++ b/third_party/tbb/governor.cpp @@ -0,0 +1,580 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/governor.h" +#include "third_party/tbb/threading_control.h" +#include "third_party/tbb/main.h" +#include "third_party/tbb/thread_data.h" +#include "third_party/tbb/market.h" +#include "third_party/tbb/arena.h" +#include "third_party/tbb/dynamic_link.h" +#include "third_party/tbb/concurrent_monitor.h" +#include "third_party/tbb/thread_dispatcher.h" + +#include "third_party/tbb/task_group.h" +#include "third_party/tbb/global_control.h" +#include "third_party/tbb/tbb_allocator.h" +#include "third_party/tbb/info.h" + +#include "third_party/tbb/task_dispatcher.h" + +#include "third_party/libcxx/cstdio" +#include "third_party/libcxx/cstdlib" +#include "third_party/libcxx/cstring" +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/algorithm" + +namespace tbb { +namespace detail { +namespace r1 { + +void clear_address_waiter_table(); + +//! global_control.cpp contains definition +bool remove_and_check_if_empty(d1::global_control& gc); +bool is_present(d1::global_control& gc); + +namespace rml { +tbb_server* make_private_server( tbb_client& client ); +} // namespace rml + +namespace system_topology { + void destroy(); +} + +//------------------------------------------------------------------------ +// governor +//------------------------------------------------------------------------ + +void governor::acquire_resources () { +#if __TBB_USE_POSIX + int status = theTLS.create(auto_terminate); +#else + int status = theTLS.create(); +#endif + if( status ) + handle_perror(status, "TBB failed to initialize task scheduler TLS\n"); + detect_cpu_features(cpu_features); + + is_rethrow_broken = gcc_rethrow_exception_broken(); +} + +void governor::release_resources () { + theRMLServerFactory.close(); + destroy_process_mask(); + + __TBB_ASSERT(!(__TBB_InitOnce::initialization_done() && theTLS.get()), "TBB is unloaded while thread data still alive?"); + + int status = theTLS.destroy(); + if( status ) + runtime_warning("failed to destroy task scheduler TLS: %s", std::strerror(status)); + clear_address_waiter_table(); + + system_topology::destroy(); + dynamic_unlink_all(); +} + +rml::tbb_server* governor::create_rml_server ( rml::tbb_client& client ) { + rml::tbb_server* server = nullptr; + if( !UsePrivateRML ) { + ::rml::factory::status_type status = theRMLServerFactory.make_server( server, client ); + if( status != ::rml::factory::st_success ) { + UsePrivateRML = true; + runtime_warning( "rml::tbb_factory::make_server failed with status %x, falling back on private rml", status ); + } + } + if ( !server ) { + __TBB_ASSERT( UsePrivateRML, nullptr); + server = rml::make_private_server( client ); + } + __TBB_ASSERT( server, "Failed to create RML server" ); + return server; +} + +void governor::one_time_init() { + if ( !__TBB_InitOnce::initialization_done() ) { + DoOneTimeInitialization(); + } +} + +bool governor::does_client_join_workers(const rml::tbb_client &client) { + return ((const thread_dispatcher&)client).must_join_workers(); +} + +/* + There is no portable way to get stack base address in Posix, however the modern + Linux versions provide pthread_attr_np API 
that can be used to obtain thread's
+    stack size and base address. Unfortunately even this function does not provide
+    enough information for the main thread on IA-64 architecture (RSE spill area
+    and memory stack are allocated as two separate discontinuous chunks of memory),
+    and there is no portable way to discern the main and the secondary threads.
+    Thus for macOS* and IA-64 architecture for Linux* OS we use the TBB worker stack size for
+    all threads and use the current stack top as the stack base. This simplified
+    approach is based on the following assumptions:
+    1) If the default stack size is insufficient for the user app needs, the
+    required amount will be explicitly specified by the user at the point of the
+    TBB scheduler initialization (as an argument to tbb::task_scheduler_init
+    constructor).
+    2) When an external thread initializes the scheduler, it has enough space on its
+    stack. Here "enough" means "at least as much as worker threads have".
+    3) If the user app strives to conserve the memory by cutting stack size, it
+    should do this for TBB workers too (as in the #1).
+*/
+static std::uintptr_t get_stack_base(std::size_t stack_size) {
+    // Stacks are growing top-down. Highest address is called "stack base",
+    // and the lowest is "stack limit".
+#if __TBB_USE_WINAPI
+    suppress_unused_warning(stack_size);
+    NT_TIB* pteb = (NT_TIB*)NtCurrentTeb();
+    __TBB_ASSERT(&pteb < pteb->StackBase && &pteb > pteb->StackLimit, "invalid stack info in TEB");
+    return reinterpret_cast<std::uintptr_t>(pteb->StackBase);
+#else
+    // There is no portable way to get stack base address in Posix, so we use
+    // non-portable method (on all modern Linux) or the simplified approach
+    // based on the common sense assumptions. The most important assumption
+    // is that the main thread's stack size is not less than that of other threads.
+
+    // Points to the lowest addressable byte of a stack.
+    void* stack_limit = nullptr;
+#if __linux__ && !__bg__
+    size_t np_stack_size = 0;
+    pthread_attr_t np_attr_stack;
+    if (0 == pthread_getattr_np(pthread_self(), &np_attr_stack)) {
+        if (0 == pthread_attr_getstack(&np_attr_stack, &stack_limit, &np_stack_size)) {
+            __TBB_ASSERT( &stack_limit > stack_limit, "stack size must be positive" );
+        }
+        pthread_attr_destroy(&np_attr_stack);
+    }
+#endif /* __linux__ */
+    std::uintptr_t stack_base{};
+    if (stack_limit) {
+        stack_base = reinterpret_cast<std::uintptr_t>(stack_limit) + stack_size;
+    } else {
+        // Use an anchor as a base stack address.
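+        // [Editorial note, not upstream code] "Anchor" means: the address of a local
+        // variable is, by construction, inside the current frame, so on a downward-growing
+        // stack it is a conservative stand-in for the unknown stack base. The same trick
+        // in isolation, as a hedged sketch:
+        //
+        //     std::uintptr_t approx_stack_top() {
+        //         int probe = 0;                                    // lives on this stack
+        //         return reinterpret_cast<std::uintptr_t>(&probe);  // ~ current stack top
+        //     }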
+        int anchor{};
+        stack_base = reinterpret_cast<std::uintptr_t>(&anchor);
+    }
+    return stack_base;
+#endif /* __TBB_USE_WINAPI */
+}
+
+#if (_WIN32||_WIN64) && !__TBB_DYNAMIC_LOAD_ENABLED
+static void register_external_thread_destructor() {
+    struct thread_destructor {
+        ~thread_destructor() {
+            governor::terminate_external_thread();
+        }
+    };
+    // ~thread_destructor() will be called during the calling thread termination
+    static thread_local thread_destructor thr_destructor;
+}
+#endif // (_WIN32||_WIN64) && !__TBB_DYNAMIC_LOAD_ENABLED
+
+void governor::init_external_thread() {
+    one_time_init();
+    // Create new scheduler instance with arena
+    int num_slots = default_num_threads();
+    // TODO_REVAMP: support an external thread without an implicit arena
+    int num_reserved_slots = 1;
+    unsigned arena_priority_level = 1; // corresponds to tbb::task_arena::priority::normal
+    std::size_t stack_size = 0;
+    threading_control* thr_control = threading_control::register_public_reference();
+    arena& a = arena::create(thr_control, num_slots, num_reserved_slots, arena_priority_level);
+    // External thread always occupies the first slot
+    thread_data& td = *new(cache_aligned_allocate(sizeof(thread_data))) thread_data(0, false);
+    td.attach_arena(a, /*slot index*/ 0);
+    __TBB_ASSERT(td.my_inbox.is_idle_state(false), nullptr);
+
+    stack_size = a.my_threading_control->worker_stack_size();
+    std::uintptr_t stack_base = get_stack_base(stack_size);
+    task_dispatcher& task_disp = td.my_arena_slot->default_task_dispatcher();
+    td.enter_task_dispatcher(task_disp, calculate_stealing_threshold(stack_base, stack_size));
+
+    td.my_arena_slot->occupy();
+    thr_control->register_thread(td);
+    set_thread_data(td);
+#if (_WIN32||_WIN64) && !__TBB_DYNAMIC_LOAD_ENABLED
+    // The external thread destructor is called from dllMain but it is not available with a static build.
+    // Therefore, we need to register the current thread to call the destructor during thread termination.
+    register_external_thread_destructor();
+#endif
+}
+
+void governor::auto_terminate(void* tls) {
+    __TBB_ASSERT(get_thread_data_if_initialized() == nullptr ||
+                 get_thread_data_if_initialized() == tls, nullptr);
+    if (tls) {
+        thread_data* td = static_cast<thread_data*>(tls);
+
+        auto clear_tls = [td] {
+            td->~thread_data();
+            cache_aligned_deallocate(td);
+            clear_thread_data();
+        };
+
+        // Only external thread can be inside an arena during termination.
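+        // [Editorial sketch, not upstream code] User-level view of this lifecycle, assuming
+        // the public parallel_for header added by this patch: an ordinary std::thread that
+        // merely calls into TBB is set up lazily by init_external_thread() above and torn
+        // down by auto_terminate() when the thread exits -- no explicit init call is needed.
+        //
+        //     #include "third_party/tbb/parallel_for.h"
+        //     #include <thread>
+        //
+        //     void worker() {
+        //         // The first TBB call on this thread triggers init_external_thread().
+        //         tbb::parallel_for(0, 1000, [](int) { /* ... */ });
+        //     }   // thread exit reaches auto_terminate() via the TLS destructor
+        //
+        //     int main() { std::thread t(worker); t.join(); }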
+ if (td->my_arena_slot) { + arena* a = td->my_arena; + threading_control* thr_control = a->my_threading_control; + + // If the TLS slot is already cleared by OS or underlying concurrency + // runtime, restore its value to properly clean up arena + if (!is_thread_data_set(td)) { + set_thread_data(*td); + } + + a->my_observers.notify_exit_observers(td->my_last_observer, td->my_is_worker); + + td->leave_task_dispatcher(); + td->my_arena_slot->release(); + // Release an arena + a->on_thread_leaving(arena::ref_external); + + thr_control->unregister_thread(*td); + + // The tls should be cleared before market::release because + // market can destroy the tls key if we keep the last reference + clear_tls(); + + // If there was an associated arena, it added a public market reference + thr_control->unregister_public_reference(/* blocking terminate =*/ false); + } else { + clear_tls(); + } + } + __TBB_ASSERT(get_thread_data_if_initialized() == nullptr, nullptr); +} + +void governor::initialize_rml_factory () { + ::rml::factory::status_type res = theRMLServerFactory.open(); + UsePrivateRML = res != ::rml::factory::st_success; +} + +void __TBB_EXPORTED_FUNC get(d1::task_scheduler_handle& handle) { + handle.m_ctl = new(allocate_memory(sizeof(global_control))) global_control(global_control::scheduler_handle, 1); +} + +void release_impl(d1::task_scheduler_handle& handle) { + if (handle.m_ctl != nullptr) { + handle.m_ctl->~global_control(); + deallocate_memory(handle.m_ctl); + handle.m_ctl = nullptr; + } +} + +bool finalize_impl(d1::task_scheduler_handle& handle) { + __TBB_ASSERT_RELEASE(handle, "trying to finalize with null handle"); + __TBB_ASSERT(is_present(*handle.m_ctl), "finalize or release was already called on this object"); + + bool ok = true; // ok if threading_control does not exist yet + if (threading_control::is_present()) { + thread_data* td = governor::get_thread_data_if_initialized(); + if (td) { + task_dispatcher* task_disp = td->my_task_dispatcher; + __TBB_ASSERT(task_disp, nullptr); + if (task_disp->m_properties.outermost && !td->my_is_worker) { // is not inside a parallel region + governor::auto_terminate(td); + } + } + + if (remove_and_check_if_empty(*handle.m_ctl)) { + ok = threading_control::unregister_lifetime_control(/*blocking_terminate*/ true); + } else { + ok = false; + } + } + + return ok; +} + +bool __TBB_EXPORTED_FUNC finalize(d1::task_scheduler_handle& handle, std::intptr_t mode) { + if (mode == d1::release_nothrowing) { + release_impl(handle); + return true; + } else { + bool ok = finalize_impl(handle); + // TODO: it is unsafe when finalize is called concurrently and further library unload + release_impl(handle); + if (mode == d1::finalize_throwing && !ok) { + throw_exception(exception_id::unsafe_wait); + } + return ok; + } +} + +#if __TBB_ARENA_BINDING + +#if __TBB_WEAK_SYMBOLS_PRESENT +#pragma weak __TBB_internal_initialize_system_topology +#pragma weak __TBB_internal_destroy_system_topology +#pragma weak __TBB_internal_allocate_binding_handler +#pragma weak __TBB_internal_deallocate_binding_handler +#pragma weak __TBB_internal_apply_affinity +#pragma weak __TBB_internal_restore_affinity +#pragma weak __TBB_internal_get_default_concurrency + +extern "C" { +void __TBB_internal_initialize_system_topology( + size_t groups_num, + int& numa_nodes_count, int*& numa_indexes_list, + int& core_types_count, int*& core_types_indexes_list +); +void __TBB_internal_destroy_system_topology( ); + +//TODO: consider renaming to `create_binding_handler` and `destroy_binding_handler` 
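+// [Editorial sketch, not upstream code] The release/finalize paths implemented above back
+// the public waiting-termination API; a typical use, assuming the tbb:: names exported by
+// global_control.h in this patch:
+//
+//     #include "third_party/tbb/global_control.h"
+//
+//     int main() {
+//         tbb::task_scheduler_handle handle{tbb::attach{}};
+//         // ... parallel work ...
+//         // Blocks until worker threads terminate; returns false if the wait cannot be
+//         // performed safely (see finalize_impl above).
+//         bool ok = tbb::finalize(handle, std::nothrow);
+//         (void)ok;
+//     }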
+binding_handler* __TBB_internal_allocate_binding_handler( int slot_num, int numa_id, int core_type_id, int max_threads_per_core ); +void __TBB_internal_deallocate_binding_handler( binding_handler* handler_ptr ); + +void __TBB_internal_apply_affinity( binding_handler* handler_ptr, int slot_num ); +void __TBB_internal_restore_affinity( binding_handler* handler_ptr, int slot_num ); + +int __TBB_internal_get_default_concurrency( int numa_id, int core_type_id, int max_threads_per_core ); +} +#endif /* __TBB_WEAK_SYMBOLS_PRESENT */ + +// Stubs that will be used if TBBbind library is unavailable. +static void dummy_destroy_system_topology ( ) { } +static binding_handler* dummy_allocate_binding_handler ( int, int, int, int ) { return nullptr; } +static void dummy_deallocate_binding_handler ( binding_handler* ) { } +static void dummy_apply_affinity ( binding_handler*, int ) { } +static void dummy_restore_affinity ( binding_handler*, int ) { } +static int dummy_get_default_concurrency( int, int, int ) { return governor::default_num_threads(); } + +// Handlers for communication with TBBbind +static void (*initialize_system_topology_ptr)( + size_t groups_num, + int& numa_nodes_count, int*& numa_indexes_list, + int& core_types_count, int*& core_types_indexes_list +) = nullptr; +static void (*destroy_system_topology_ptr)( ) = dummy_destroy_system_topology; + +static binding_handler* (*allocate_binding_handler_ptr)( int slot_num, int numa_id, int core_type_id, int max_threads_per_core ) + = dummy_allocate_binding_handler; +static void (*deallocate_binding_handler_ptr)( binding_handler* handler_ptr ) + = dummy_deallocate_binding_handler; +static void (*apply_affinity_ptr)( binding_handler* handler_ptr, int slot_num ) + = dummy_apply_affinity; +static void (*restore_affinity_ptr)( binding_handler* handler_ptr, int slot_num ) + = dummy_restore_affinity; +int (*get_default_concurrency_ptr)( int numa_id, int core_type_id, int max_threads_per_core ) + = dummy_get_default_concurrency; + +#if _WIN32 || _WIN64 || __unix__ +// Table describing how to link the handlers. +static const dynamic_link_descriptor TbbBindLinkTable[] = { + DLD(__TBB_internal_initialize_system_topology, initialize_system_topology_ptr), + DLD(__TBB_internal_destroy_system_topology, destroy_system_topology_ptr), + DLD(__TBB_internal_allocate_binding_handler, allocate_binding_handler_ptr), + DLD(__TBB_internal_deallocate_binding_handler, deallocate_binding_handler_ptr), + DLD(__TBB_internal_apply_affinity, apply_affinity_ptr), + DLD(__TBB_internal_restore_affinity, restore_affinity_ptr), + DLD(__TBB_internal_get_default_concurrency, get_default_concurrency_ptr) +}; + +static const unsigned LinkTableSize = sizeof(TbbBindLinkTable) / sizeof(dynamic_link_descriptor); + +#if TBB_USE_DEBUG +#define DEBUG_SUFFIX "_debug" +#else +#define DEBUG_SUFFIX +#endif /* TBB_USE_DEBUG */ + +#if _WIN32 || _WIN64 +#define LIBRARY_EXTENSION ".dll" +#define LIBRARY_PREFIX +#elif __unix__ +#define LIBRARY_EXTENSION __TBB_STRING(.so.3) +#define LIBRARY_PREFIX "lib" +#endif /* __unix__ */ + +#define TBBBIND_NAME LIBRARY_PREFIX "tbbbind" DEBUG_SUFFIX LIBRARY_EXTENSION +#define TBBBIND_2_0_NAME LIBRARY_PREFIX "tbbbind_2_0" DEBUG_SUFFIX LIBRARY_EXTENSION + +#define TBBBIND_2_5_NAME LIBRARY_PREFIX "tbbbind_2_5" DEBUG_SUFFIX LIBRARY_EXTENSION +#endif /* _WIN32 || _WIN64 || __unix__ */ + +// Representation of system hardware topology information on the TBB side. +// System topology may be initialized by third-party component (e.g. 
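+// [Editorial sketch, not upstream code] The stubs and link table above implement a
+// "stub until linked" scheme: every handler starts out pointing at a dummy and is
+// overwritten only if the optional tbbbind library is found. The generic shape of the
+// idea, with hypothetical names, using plain dlopen/dlsym:
+//
+//     #include <dlfcn.h>
+//
+//     static int fallback_query() { return -1; }        // safe default answer
+//     static int (*query_ptr)() = fallback_query;       // stub until linked
+//
+//     static void try_link_optional_lib() {
+//         if (void* lib = dlopen("liboptional.so", RTLD_LAZY | RTLD_LOCAL)) {  // hypothetical soname
+//             if (void* sym = dlsym(lib, "real_query"))                        // hypothetical symbol
+//                 query_ptr = reinterpret_cast<int (*)()>(sym);
+//         }
+//     }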
hwloc) +// or just filled in with default stubs. +namespace system_topology { + +constexpr int automatic = -1; + +static std::atomic initialization_state; + +namespace { +int numa_nodes_count = 0; +int* numa_nodes_indexes = nullptr; + +int core_types_count = 0; +int* core_types_indexes = nullptr; + +const char* load_tbbbind_shared_object() { +#if _WIN32 || _WIN64 || __unix__ +#if _WIN32 && !_WIN64 + // For 32-bit Windows applications, process affinity masks can only support up to 32 logical CPUs. + SYSTEM_INFO si; + GetNativeSystemInfo(&si); + if (si.dwNumberOfProcessors > 32) return nullptr; +#endif /* _WIN32 && !_WIN64 */ + for (const auto& tbbbind_version : {TBBBIND_2_5_NAME, TBBBIND_2_0_NAME, TBBBIND_NAME}) { + if (dynamic_link(tbbbind_version, TbbBindLinkTable, LinkTableSize, nullptr, DYNAMIC_LINK_LOCAL_BINDING)) { + return tbbbind_version; + } + } +#endif /* _WIN32 || _WIN64 || __unix__ */ + return nullptr; +} + +int processor_groups_num() { +#if _WIN32 + return NumberOfProcessorGroups(); +#else + // Stub to improve code readability by reducing number of the compile-time conditions + return 1; +#endif +} +} // internal namespace + +// Tries to load TBBbind library API, if success, gets NUMA topology information from it, +// in another case, fills NUMA topology by stubs. +void initialization_impl() { + governor::one_time_init(); + + if (const char* tbbbind_name = load_tbbbind_shared_object()) { + initialize_system_topology_ptr( + processor_groups_num(), + numa_nodes_count, numa_nodes_indexes, + core_types_count, core_types_indexes + ); + + PrintExtraVersionInfo("TBBBIND", tbbbind_name); + return; + } + + static int dummy_index = automatic; + + numa_nodes_count = 1; + numa_nodes_indexes = &dummy_index; + + core_types_count = 1; + core_types_indexes = &dummy_index; + + PrintExtraVersionInfo("TBBBIND", "UNAVAILABLE"); +} + +void initialize() { + atomic_do_once(initialization_impl, initialization_state); +} + +void destroy() { + destroy_system_topology_ptr(); +} +} // namespace system_topology + +binding_handler* construct_binding_handler(int slot_num, int numa_id, int core_type_id, int max_threads_per_core) { + system_topology::initialize(); + return allocate_binding_handler_ptr(slot_num, numa_id, core_type_id, max_threads_per_core); +} + +void destroy_binding_handler(binding_handler* handler_ptr) { + __TBB_ASSERT(deallocate_binding_handler_ptr, "tbbbind loading was not performed"); + deallocate_binding_handler_ptr(handler_ptr); +} + +void apply_affinity_mask(binding_handler* handler_ptr, int slot_index) { + __TBB_ASSERT(slot_index >= 0, "Negative thread index"); + __TBB_ASSERT(apply_affinity_ptr, "tbbbind loading was not performed"); + apply_affinity_ptr(handler_ptr, slot_index); +} + +void restore_affinity_mask(binding_handler* handler_ptr, int slot_index) { + __TBB_ASSERT(slot_index >= 0, "Negative thread index"); + __TBB_ASSERT(restore_affinity_ptr, "tbbbind loading was not performed"); + restore_affinity_ptr(handler_ptr, slot_index); +} + +unsigned __TBB_EXPORTED_FUNC numa_node_count() { + system_topology::initialize(); + return system_topology::numa_nodes_count; +} + +void __TBB_EXPORTED_FUNC fill_numa_indices(int* index_array) { + system_topology::initialize(); + std::memcpy(index_array, system_topology::numa_nodes_indexes, system_topology::numa_nodes_count * sizeof(int)); +} + +int __TBB_EXPORTED_FUNC numa_default_concurrency(int node_id) { + if (node_id >= 0) { + system_topology::initialize(); + int result = get_default_concurrency_ptr( + node_id, + 
/*core_type*/system_topology::automatic, + /*threads_per_core*/system_topology::automatic + ); + if (result > 0) return result; + } + return governor::default_num_threads(); +} + +unsigned __TBB_EXPORTED_FUNC core_type_count(intptr_t /*reserved*/) { + system_topology::initialize(); + return system_topology::core_types_count; +} + +void __TBB_EXPORTED_FUNC fill_core_type_indices(int* index_array, intptr_t /*reserved*/) { + system_topology::initialize(); + std::memcpy(index_array, system_topology::core_types_indexes, system_topology::core_types_count * sizeof(int)); +} + +void constraints_assertion(d1::constraints c) { + bool is_topology_initialized = system_topology::initialization_state == do_once_state::initialized; + __TBB_ASSERT_RELEASE(c.max_threads_per_core == system_topology::automatic || c.max_threads_per_core > 0, + "Wrong max_threads_per_core constraints field value."); + + auto numa_nodes_begin = system_topology::numa_nodes_indexes; + auto numa_nodes_end = system_topology::numa_nodes_indexes + system_topology::numa_nodes_count; + __TBB_ASSERT_RELEASE( + c.numa_id == system_topology::automatic || + (is_topology_initialized && std::find(numa_nodes_begin, numa_nodes_end, c.numa_id) != numa_nodes_end), + "The constraints::numa_id value is not known to the library. Use tbb::info::numa_nodes() to get the list of possible values."); + + int* core_types_begin = system_topology::core_types_indexes; + int* core_types_end = system_topology::core_types_indexes + system_topology::core_types_count; + __TBB_ASSERT_RELEASE(c.core_type == system_topology::automatic || + (is_topology_initialized && std::find(core_types_begin, core_types_end, c.core_type) != core_types_end), + "The constraints::core_type value is not known to the library. Use tbb::info::core_types() to get the list of possible values."); +} + +int __TBB_EXPORTED_FUNC constraints_default_concurrency(const d1::constraints& c, intptr_t /*reserved*/) { + constraints_assertion(c); + + if (c.numa_id >= 0 || c.core_type >= 0 || c.max_threads_per_core > 0) { + system_topology::initialize(); + return get_default_concurrency_ptr(c.numa_id, c.core_type, c.max_threads_per_core); + } + return governor::default_num_threads(); +} + +int __TBB_EXPORTED_FUNC constraints_threads_per_core(const d1::constraints&, intptr_t /*reserved*/) { + return system_topology::automatic; +} +#endif /* __TBB_ARENA_BINDING */ + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/governor.h b/third_party/tbb/governor.h new file mode 100644 index 000000000..b1efe08c8 --- /dev/null +++ b/third_party/tbb/governor.h @@ -0,0 +1,157 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef _TBB_governor_H +#define _TBB_governor_H + +#include "third_party/tbb/rml_tbb.h" + +#include "third_party/tbb/misc.h" // for AvailableHwConcurrency +#include "third_party/tbb/tls.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class market; +class thread_data; +class __TBB_InitOnce; + +#if __TBB_USE_ITT_NOTIFY +//! Defined in profiling.cpp +extern bool ITT_Present; +#endif + +typedef std::size_t stack_size_type; + +//------------------------------------------------------------------------ +// Class governor +//------------------------------------------------------------------------ + +//! The class handles access to the single instance of market, and to TLS to keep scheduler instances. +/** It also supports automatic on-demand initialization of the TBB scheduler. + The class contains only static data members and methods.*/ +class governor { +private: + friend class __TBB_InitOnce; + friend class thread_dispatcher; + friend class threading_control_impl; + + // TODO: consider using thread_local (measure performance and side effects) + //! TLS for scheduler instances associated with individual threads + static basic_tls theTLS; + + // TODO (TBB_REVAMP_TODO): reconsider constant names + static rml::tbb_factory theRMLServerFactory; + + static bool UsePrivateRML; + + // Flags for runtime-specific conditions + static cpu_features_type cpu_features; + static bool is_rethrow_broken; + + //! Create key for thread-local storage and initialize RML. + static void acquire_resources (); + + //! Destroy the thread-local storage key and deinitialize RML. + static void release_resources (); + + static rml::tbb_server* create_rml_server ( rml::tbb_client& ); + +public: + static unsigned default_num_threads () { + // Caches the maximal level of parallelism supported by the hardware + static unsigned num_threads = AvailableHwConcurrency(); + return num_threads; + } + static std::size_t default_page_size () { + // Caches the size of OS regular memory page + static std::size_t page_size = DefaultSystemPageSize(); + return page_size; + } + static void one_time_init(); + //! Processes scheduler initialization request (possibly nested) in an external thread + /** If necessary creates new instance of arena and/or local scheduler. + The auto_init argument specifies if the call is due to automatic initialization. **/ + static void init_external_thread(); + + //! The routine to undo automatic initialization. + /** The signature is written with void* so that the routine + can be the destructor argument to pthread_key_create. */ + static void auto_terminate(void* tls); + + //! Obtain the thread-local instance of the thread data. + /** If the scheduler has not been initialized yet, initialization is done automatically. + Note that auto-initialized scheduler instance is destroyed only when its thread terminates. **/ + static thread_data* get_thread_data() { + thread_data* td = theTLS.get(); + if (td) { + return td; + } + init_external_thread(); + td = theTLS.get(); + __TBB_ASSERT(td, nullptr); + return td; + } + + static void set_thread_data(thread_data& td) { + theTLS.set(&td); + } + + static void clear_thread_data() { + theTLS.set(nullptr); + } + + static thread_data* get_thread_data_if_initialized () { + return theTLS.get(); + } + + static bool is_thread_data_set(thread_data* td) { + return theTLS.get() == td; + } + + //! Undo automatic initialization if necessary; call when a thread exits. 
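+    // [Editorial note, not upstream code] default_num_threads() and default_page_size()
+    // above rely on C++11 "magic statics": the function-local static is initialized once,
+    // thread-safely, on first call, which is the entire caching scheme. A minimal model:
+    //
+    //     #include <thread>
+    //
+    //     unsigned cached_hw_concurrency() {
+    //         static const unsigned value = std::thread::hardware_concurrency(); // computed once
+    //         return value;                                                       // cheap afterwards
+    //     }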
+ static void terminate_external_thread() { + auto_terminate(get_thread_data_if_initialized()); + } + + static void initialize_rml_factory (); + + static bool does_client_join_workers (const rml::tbb_client &client); + + static bool speculation_enabled() { return cpu_features.rtm_enabled; } + +#if __TBB_WAITPKG_INTRINSICS_PRESENT + static bool wait_package_enabled() { return cpu_features.waitpkg_enabled; } +#endif + + static bool rethrow_exception_broken() { return is_rethrow_broken; } + + static bool is_itt_present() { +#if __TBB_USE_ITT_NOTIFY + return ITT_Present; +#else + return false; +#endif + } +}; // class governor + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_governor_H */ diff --git a/third_party/tbb/info.h b/third_party/tbb/info.h new file mode 100644 index 000000000..b90d38bb2 --- /dev/null +++ b/third_party/tbb/info.h @@ -0,0 +1,126 @@ +// clang-format off +/* + Copyright (c) 2019-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_info_H +#define __TBB_info_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" + +#if __TBB_ARENA_BINDING +#include "third_party/libcxx/vector" +#include "third_party/libcxx/cstdint" + +namespace tbb { +namespace detail { + +namespace d1{ + +using numa_node_id = int; +using core_type_id = int; + +// TODO: consider version approach to resolve backward compatibility potential issues. +struct constraints { +#if !__TBB_CPP20_PRESENT + constraints(numa_node_id id = -1, int maximal_concurrency = -1) + : numa_id(id) + , max_concurrency(maximal_concurrency) + {} +#endif /*!__TBB_CPP20_PRESENT*/ + + constraints& set_numa_id(numa_node_id id) { + numa_id = id; + return *this; + } + constraints& set_max_concurrency(int maximal_concurrency) { + max_concurrency = maximal_concurrency; + return *this; + } + constraints& set_core_type(core_type_id id) { + core_type = id; + return *this; + } + constraints& set_max_threads_per_core(int threads_number) { + max_threads_per_core = threads_number; + return *this; + } + + numa_node_id numa_id = -1; + int max_concurrency = -1; + core_type_id core_type = -1; + int max_threads_per_core = -1; +}; + +} // namespace d1 + +namespace r1 { +TBB_EXPORT unsigned __TBB_EXPORTED_FUNC numa_node_count(); +TBB_EXPORT void __TBB_EXPORTED_FUNC fill_numa_indices(int* index_array); +TBB_EXPORT int __TBB_EXPORTED_FUNC numa_default_concurrency(int numa_id); + +// Reserved fields are required to save binary backward compatibility in case of future changes. +// They must be defined to 0 at this moment. 
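+// [Editorial sketch, not upstream code] The constraints struct above is a small fluent
+// builder; together with the inline helpers later in this header it can be used roughly
+// like this (the d1 spelling is shown for illustration -- applications normally reach
+// these types through the public task_arena/info aliases):
+//
+//     #include "third_party/tbb/info.h"
+//
+//     tbb::detail::d1::constraints c;
+//     c.set_numa_id(tbb::info::numa_nodes().front())
+//      .set_max_threads_per_core(1);
+//     int threads = tbb::info::default_concurrency(c);   // concurrency under those limits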
+TBB_EXPORT unsigned __TBB_EXPORTED_FUNC core_type_count(intptr_t reserved = 0); +TBB_EXPORT void __TBB_EXPORTED_FUNC fill_core_type_indices(int* index_array, intptr_t reserved = 0); + +TBB_EXPORT int __TBB_EXPORTED_FUNC constraints_default_concurrency(const d1::constraints& c, intptr_t reserved = 0); +TBB_EXPORT int __TBB_EXPORTED_FUNC constraints_threads_per_core(const d1::constraints& c, intptr_t reserved = 0); +} // namespace r1 + +namespace d1 { + +inline std::vector numa_nodes() { + std::vector node_indices(r1::numa_node_count()); + r1::fill_numa_indices(node_indices.data()); + return node_indices; +} + +inline int default_concurrency(numa_node_id id = -1) { + return r1::numa_default_concurrency(id); +} + +inline std::vector core_types() { + std::vector core_type_indexes(r1::core_type_count()); + r1::fill_core_type_indices(core_type_indexes.data()); + return core_type_indexes; +} + +inline int default_concurrency(constraints c) { + if (c.max_concurrency > 0) { return c.max_concurrency; } + return r1::constraints_default_concurrency(c); +} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::numa_node_id; +using detail::d1::core_type_id; + +namespace info { +using detail::d1::numa_nodes; +using detail::d1::core_types; + +using detail::d1::default_concurrency; +} // namespace info +} // namespace v1 + +} // namespace tbb + +#endif /*__TBB_ARENA_BINDING*/ + +#endif /*__TBB_info_H*/ diff --git a/third_party/tbb/intrusive_list.h b/third_party/tbb/intrusive_list.h new file mode 100644 index 000000000..c0f1b19e2 --- /dev/null +++ b/third_party/tbb/intrusive_list.h @@ -0,0 +1,234 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_intrusive_list_H +#define _TBB_intrusive_list_H + +#include "third_party/tbb/detail/_intrusive_list_node.h" + +namespace tbb { +namespace detail { +namespace r1 { + +using d1::intrusive_list_node; + +//! List of element of type T, where T is derived from intrusive_list_node +/** The class is not thread safe. **/ +template +class intrusive_list_base { + //! Pointer to the head node + intrusive_list_node my_head; + + //! 
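+    // [Editorial sketch, not upstream code] Rough usage of the intrusive_list<T> wrapper
+    // defined later in this header (an internal helper, shown only for illustration):
+    // the element type embeds the links by deriving from intrusive_list_node, so putting
+    // it on a list allocates nothing.
+    //
+    //     struct job : public tbb::detail::d1::intrusive_list_node {   // hypothetical element
+    //         int id;
+    //     };
+    //
+    //     tbb::detail::r1::intrusive_list<job> list;   // does not own its elements
+    //     job j1; j1.id = 1;
+    //     list.push_front(j1);
+    //     for (job& j : list) { (void)j.id; /* visit */ }
+    //     list.remove(j1);                             // must unlink before j1 dies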
Number of list elements + std::size_t my_size; + + static intrusive_list_node& node ( T& item ) { return List::node(item); } + + static T& item ( intrusive_list_node* node ) { return List::item(node); } + + static const T& item( const intrusive_list_node* node ) { return List::item(node); } + + template + class iterator_impl { + static_assert(std::is_same::value || + std::is_same::value, + "Incorrect DereferenceType in iterator_impl"); + + using pointer_type = typename std::conditional::value, + intrusive_list_node*, + const intrusive_list_node*>::type; + + public: + iterator_impl() : my_pos(nullptr) {} + + iterator_impl( pointer_type pos ) : my_pos(pos) {} + + iterator_impl& operator++() { + my_pos = my_pos->my_next_node; + return *this; + } + + iterator_impl operator++( int ) { + iterator_impl it(*this); + ++*this; + return it; + } + + iterator_impl& operator--() { + my_pos = my_pos->my_prev_node; + return *this; + } + + iterator_impl operator--( int ) { + iterator_impl it(*this); + --*this; + return it; + } + + bool operator==( const iterator_impl& rhs ) const { + return my_pos == rhs.my_pos; + } + + bool operator!=( const iterator_impl& rhs ) const { + return my_pos != rhs.my_pos; + } + + DereferenceType& operator*() const { + return intrusive_list_base::item(my_pos); + } + + DereferenceType* operator->() const { + return &intrusive_list_base::item(my_pos); + } + private: + // Node the iterator points to at the moment + pointer_type my_pos; + }; // class iterator_impl + + void assert_ok () const { + __TBB_ASSERT( (my_head.my_prev_node == &my_head && !my_size) || + (my_head.my_next_node != &my_head && my_size >0), "intrusive_list_base corrupted" ); +#if TBB_USE_ASSERT >= 2 + std::size_t i = 0; + for ( intrusive_list_node *n = my_head.my_next_node; n != &my_head; n = n->my_next_node ) + ++i; + __TBB_ASSERT( my_size == i, "Wrong size" ); +#endif /* TBB_USE_ASSERT >= 2 */ + } + +public: + using iterator = iterator_impl; + using const_iterator = iterator_impl; + + intrusive_list_base () : my_size(0) { + my_head.my_prev_node = &my_head; + my_head.my_next_node = &my_head; + } + + bool empty () const { return my_head.my_next_node == &my_head; } + + std::size_t size () const { return my_size; } + + iterator begin () { return iterator(my_head.my_next_node); } + + iterator end () { return iterator(&my_head); } + + const_iterator begin () const { return const_iterator(my_head.my_next_node); } + + const_iterator end () const { return const_iterator(&my_head); } + + void push_front ( T& val ) { + __TBB_ASSERT( node(val).my_prev_node == &node(val) && node(val).my_next_node == &node(val), + "Object with intrusive list node can be part of only one intrusive list simultaneously" ); + // An object can be part of only one intrusive list at the given moment via the given node member + node(val).my_prev_node = &my_head; + node(val).my_next_node = my_head.my_next_node; + my_head.my_next_node->my_prev_node = &node(val); + my_head.my_next_node = &node(val); + ++my_size; + assert_ok(); + } + + void remove( T& val ) { + __TBB_ASSERT( node(val).my_prev_node != &node(val) && node(val).my_next_node != &node(val), "Element to remove is not in the list" ); + __TBB_ASSERT( node(val).my_prev_node->my_next_node == &node(val) && node(val).my_next_node->my_prev_node == &node(val), "Element to remove is not in the list" ); + --my_size; + node(val).my_next_node->my_prev_node = node(val).my_prev_node; + node(val).my_prev_node->my_next_node = node(val).my_next_node; +#if TBB_USE_ASSERT + node(val).my_prev_node = 
node(val).my_next_node = &node(val); +#endif + assert_ok(); + } + + iterator erase ( iterator it ) { + T& val = *it; + ++it; + remove( val ); + return it; + } + +}; // intrusive_list_base + +#if __TBB_TODO +// With standard compliant compilers memptr_intrusive_list could be named simply intrusive_list, +// and inheritance based intrusive_list version would become its partial specialization. +// Here are the corresponding declarations: + +struct dummy_intrusive_list_item { intrusive_list_node my_node; }; + +template +class intrusive_list : public intrusive_list_base, T>; + +template +class intrusive_list + : public intrusive_list_base, T>; + +#endif /* __TBB_TODO */ + +//! Double linked list of items of type T containing a member of type intrusive_list_node. +/** NodePtr is a member pointer to the node data field. Class U is either T or + a base class of T containing the node member. Default values exist for the sake + of a partial specialization working with inheritance case. + + The list does not have ownership of its items. Its purpose is to avoid dynamic + memory allocation when forming lists of existing objects. + + The class is not thread safe. **/ +template +class memptr_intrusive_list : public intrusive_list_base, T> +{ + friend class intrusive_list_base, T>; + + static intrusive_list_node& node ( T& val ) { return val.*NodePtr; } + + static T& item ( intrusive_list_node* node ) { + // Cannot use __TBB_offsetof (and consequently __TBB_get_object_ref) macro + // with *NodePtr argument because gcc refuses to interpret pasted "->" and "*" + // as member pointer dereferencing operator, and explicit usage of ## in + // __TBB_offsetof implementation breaks operations with normal member names. + return *reinterpret_cast((char*)node - ((ptrdiff_t)&(reinterpret_cast(0x1000)->*NodePtr) - 0x1000)); + } + + static const T& item( const intrusive_list_node* node ) { + return item(const_cast(node)); + } + +}; // intrusive_list + +//! Double linked list of items of type T that is derived from intrusive_list_node class. +/** The list does not have ownership of its items. Its purpose is to avoid dynamic + memory allocation when forming lists of existing objects. + + The class is not thread safe. **/ +template +class intrusive_list : public intrusive_list_base, T> +{ + friend class intrusive_list_base, T>; + + static intrusive_list_node& node ( T& val ) { return val; } + + static T& item ( intrusive_list_node* node ) { return *static_cast(node); } + + static const T& item( const intrusive_list_node* node ) { return *static_cast(node); } +}; // intrusive_list + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_intrusive_list_H */ diff --git a/third_party/tbb/itt_notify.cpp b/third_party/tbb/itt_notify.cpp new file mode 100644 index 000000000..fe9325490 --- /dev/null +++ b/third_party/tbb/itt_notify.cpp @@ -0,0 +1,70 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#if __TBB_USE_ITT_NOTIFY + +#if _WIN32||_WIN64 + #ifndef UNICODE + #define UNICODE + #endif +#else + #pragma weak dlopen + #pragma weak dlsym + #pragma weak dlerror +#endif /* WIN */ + +#if __TBB_BUILD + +extern "C" void ITT_DoOneTimeInitialization(); +#define __itt_init_ittlib_name(x,y) (ITT_DoOneTimeInitialization(), true) + +#elif __TBBMALLOC_BUILD + +extern "C" void MallocInitializeITT(); +#define __itt_init_ittlib_name(x,y) (MallocInitializeITT(), true) + +#else +#error This file is expected to be used for either TBB or TBB allocator build. +#endif // __TBB_BUILD + +// MISSING #include "tools_api/ittnotify_static.c" + +namespace tbb { +namespace detail { +namespace r1 { + +/** This extra proxy method is necessary since __itt_init_lib is declared as static **/ +int __TBB_load_ittnotify() { +#if !(_WIN32||_WIN64) + // tool_api crashes without dlopen, check that it's present. Common case + // for lack of dlopen is static binaries, i.e. ones build with -static. + if (dlopen == nullptr) + return 0; +#endif + return __itt_init_ittlib(nullptr, // groups for: + (__itt_group_id)(__itt_group_sync // prepare/cancel/acquired/releasing + | __itt_group_thread // name threads + | __itt_group_stitch // stack stitching + | __itt_group_structure + )); +} + +} //namespace r1 +} //namespace detail +} // namespace tbb + +#endif /* __TBB_USE_ITT_NOTIFY */ diff --git a/third_party/tbb/itt_notify.h b/third_party/tbb/itt_notify.h new file mode 100644 index 000000000..eba910d27 --- /dev/null +++ b/third_party/tbb/itt_notify.h @@ -0,0 +1,118 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_ITT_NOTIFY +#define _TBB_ITT_NOTIFY + +#include "third_party/tbb/detail/_config.h" + +#if __TBB_USE_ITT_NOTIFY + +#if _WIN32||_WIN64 + #ifndef UNICODE + #define UNICODE + #endif +#endif /* WIN */ + +#ifndef INTEL_ITTNOTIFY_API_PRIVATE +#define INTEL_ITTNOTIFY_API_PRIVATE +#endif + +// MISSING #include "tools_api/ittnotify.h" +// MISSING #include "tools_api/legacy/ittnotify.h" +extern "C" void __itt_fini_ittlib(void); +extern "C" void __itt_release_resources(void); + +#if _WIN32||_WIN64 + #undef _T +#endif /* WIN */ + +#endif /* __TBB_USE_ITT_NOTIFY */ + +#if !ITT_CALLER_NULL +#define ITT_CALLER_NULL ((__itt_caller)0) +#endif + +namespace tbb { +namespace detail { +namespace r1 { + +//! Unicode support +#if (_WIN32||_WIN64) + //! Unicode character type. Always wchar_t on Windows. + /** We do not use typedefs from Windows TCHAR family to keep consistence of TBB coding style. **/ + using tchar = wchar_t; + //! Standard Windows macro to markup the string literals. + #define _T(string_literal) L ## string_literal +#else /* !WIN */ + using tchar = char; + //! Standard Windows style macro to markup the string literals. + #define _T(string_literal) string_literal +#endif /* !WIN */ + +//! Display names of internal synchronization types +extern const tchar + *SyncType_Scheduler; +//! 
Display names of internal synchronization components/scenarios +extern const tchar + *SyncObj_ContextsList + ; + +#if __TBB_USE_ITT_NOTIFY +// const_cast() is necessary to cast off volatility +#define ITT_NOTIFY(name,obj) __itt_##name(const_cast(static_cast(obj))) +#define ITT_THREAD_SET_NAME(name) __itt_thread_set_name(name) +#define ITT_FINI_ITTLIB() __itt_fini_ittlib() +#define ITT_RELEASE_RESOURCES() __itt_release_resources() +#define ITT_SYNC_CREATE(obj, type, name) __itt_sync_create((void*)(obj), type, name, 2) +#define ITT_STACK_CREATE(obj) obj = __itt_stack_caller_create() +#define ITT_STACK_DESTROY(obj) (obj!=nullptr) ? __itt_stack_caller_destroy(static_cast<__itt_caller>(obj)) : ((void)0) +#define ITT_CALLEE_ENTER(cond, t, obj) if(cond) {\ + __itt_stack_callee_enter(static_cast<__itt_caller>(obj));\ + __itt_sync_acquired(t);\ + } +#define ITT_CALLEE_LEAVE(cond, obj) (cond) ? __itt_stack_callee_leave(static_cast<__itt_caller>(obj)) : ((void)0) + +#define ITT_TASK_GROUP(obj,name,parent) r1::itt_make_task_group(d1::ITT_DOMAIN_MAIN,(void*)(obj),ALGORITHM,(void*)(parent),(parent!=nullptr) ? ALGORITHM : FLOW_NULL,name) +#define ITT_TASK_BEGIN(obj,name,id) r1::itt_task_begin(d1::ITT_DOMAIN_MAIN,(void*)(id),ALGORITHM,(void*)(obj),ALGORITHM,name) +#define ITT_TASK_END r1::itt_task_end(d1::ITT_DOMAIN_MAIN) + + +#else /* !__TBB_USE_ITT_NOTIFY */ + +#define ITT_NOTIFY(name,obj) ((void)0) +#define ITT_THREAD_SET_NAME(name) ((void)0) +#define ITT_FINI_ITTLIB() ((void)0) +#define ITT_RELEASE_RESOURCES() ((void)0) +#define ITT_SYNC_CREATE(obj, type, name) ((void)0) +#define ITT_STACK_CREATE(obj) ((void)0) +#define ITT_STACK_DESTROY(obj) ((void)0) +#define ITT_CALLEE_ENTER(cond, t, obj) ((void)0) +#define ITT_CALLEE_LEAVE(cond, obj) ((void)0) +#define ITT_TASK_GROUP(type,name,parent) ((void)0) +#define ITT_TASK_BEGIN(type,name,id) ((void)0) +#define ITT_TASK_END ((void)0) + +#endif /* !__TBB_USE_ITT_NOTIFY */ + +int __TBB_load_ittnotify(); + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_ITT_NOTIFY */ diff --git a/third_party/tbb/mailbox.h b/third_party/tbb/mailbox.h new file mode 100644 index 000000000..3cb2f0646 --- /dev/null +++ b/third_party/tbb/mailbox.h @@ -0,0 +1,247 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_mailbox_H +#define _TBB_mailbox_H + +#include "third_party/tbb/cache_aligned_allocator.h" +#include "third_party/tbb/detail/_small_object_pool.h" + +#include "third_party/tbb/scheduler_common.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace r1 { + +struct task_proxy : public d1::task { + static const intptr_t pool_bit = 1<<0; + static const intptr_t mailbox_bit = 1<<1; + static const intptr_t location_mask = pool_bit | mailbox_bit; + /* All but two low-order bits represent a (task*). + Two low-order bits mean: + 1 = proxy is/was/will be in task pool + 2 = proxy is/was/will be in mailbox */ + std::atomic task_and_tag; + + //! 
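+    // [Editorial note, not upstream code] task_and_tag above packs a task* together with the
+    // two location bits; the helpers task_ptr()/is_shared() below recover the pieces. Spelled
+    // out:
+    //
+    //     d1::task* t = /* ... */;
+    //     intptr_t tat = reinterpret_cast<intptr_t>(t) | task_proxy::pool_bit | task_proxy::mailbox_bit;
+    //     d1::task* back = task_proxy::task_ptr(tat);      // tag bits stripped
+    //     bool shared    = task_proxy::is_shared(tat);     // true: still in pool and mailbox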
Pointer to next task_proxy in a mailbox + std::atomic next_in_mailbox; + + //! Mailbox to which this was mailed. + mail_outbox* outbox; + + //! Task affinity id which is referenced + d1::slot_id slot; + + d1::small_object_allocator allocator; + + //! True if the proxy is stored both in its sender's pool and in the destination mailbox. + static bool is_shared ( intptr_t tat ) { + return (tat & location_mask) == location_mask; + } + + //! Returns a pointer to the encapsulated task or nullptr. + static task* task_ptr ( intptr_t tat ) { + return (task*)(tat & ~location_mask); + } + + //! Returns a pointer to the encapsulated task or nullptr, and frees proxy if necessary. + template + inline task* extract_task () { + // __TBB_ASSERT( prefix().extra_state == es_task_proxy, "Normal task misinterpreted as a proxy?" ); + intptr_t tat = task_and_tag.load(std::memory_order_acquire); + __TBB_ASSERT( tat == from_bit || (is_shared(tat) && task_ptr(tat)), + "Proxy's tag cannot specify both locations if the proxy " + "was retrieved from one of its original locations" ); + if ( tat != from_bit ) { + const intptr_t cleaner_bit = location_mask & ~from_bit; + // Attempt to transition the proxy to the "empty" state with + // cleaner_bit specifying entity responsible for its eventual freeing. + // Explicit cast to void* is to work around a seeming ICC 11.1 bug. + if ( task_and_tag.compare_exchange_strong(tat, cleaner_bit) ) { + // Successfully grabbed the task, and left new owner with the job of freeing the proxy + return task_ptr(tat); + } + } + // Proxied task has already been claimed from another proxy location. + __TBB_ASSERT( task_and_tag.load(std::memory_order_relaxed) == from_bit, "Empty proxy cannot contain non-zero task pointer" ); + return nullptr; + } + + task* execute(d1::execution_data&) override { + __TBB_ASSERT_RELEASE(false, nullptr); + return nullptr; + } + task* cancel(d1::execution_data&) override { + __TBB_ASSERT_RELEASE(false, nullptr); + return nullptr; + } +}; // struct task_proxy + +//! Internal representation of mail_outbox, without padding. +class unpadded_mail_outbox { +protected: + typedef std::atomic atomic_proxy_ptr; + + //! Pointer to first task_proxy in mailbox, or nullptr if box is empty. + atomic_proxy_ptr my_first; + + //! Pointer to pointer that will point to next item in the queue. Never nullptr. + std::atomic my_last; + + //! Owner of mailbox is not executing a task, and has drained its own task pool. + std::atomic my_is_idle; +}; + +// TODO: - consider moving to arena slot +//! Class representing where mail is put. +/** Padded to occupy a cache line. */ +class mail_outbox : padded { + + task_proxy* internal_pop( isolation_type isolation ) { + task_proxy* curr = my_first.load(std::memory_order_acquire); + if ( !curr ) + return nullptr; + atomic_proxy_ptr* prev_ptr = &my_first; + if ( isolation != no_isolation ) { + while ( task_accessor::isolation(*curr) != isolation ) { + prev_ptr = &curr->next_in_mailbox; + // The next_in_mailbox should be read with acquire to guarantee (*curr) consistency. + curr = curr->next_in_mailbox.load(std::memory_order_acquire); + if ( !curr ) + return nullptr; + } + } + // There is a first item in the mailbox. See if there is a second. + // The next_in_mailbox should be read with acquire to guarantee (*second) consistency. + if ( task_proxy* second = curr->next_in_mailbox.load(std::memory_order_acquire) ) { + // There are at least two items, so first item can be popped easily. 
+ prev_ptr->store(second, std::memory_order_relaxed); + } else { + // There is only one item. Some care is required to pop it. + + prev_ptr->store(nullptr, std::memory_order_relaxed); + atomic_proxy_ptr* expected = &curr->next_in_mailbox; + if ( my_last.compare_exchange_strong( expected, prev_ptr ) ) { + // Successfully transitioned mailbox from having one item to having none. + __TBB_ASSERT( !curr->next_in_mailbox.load(std::memory_order_relaxed), nullptr); + } else { + // Some other thread updated my_last but has not filled in first->next_in_mailbox + // Wait until first item points to second item. + atomic_backoff backoff; + // The next_in_mailbox should be read with acquire to guarantee (*second) consistency. + while ( !(second = curr->next_in_mailbox.load(std::memory_order_acquire)) ) backoff.pause(); + prev_ptr->store( second, std::memory_order_relaxed); + } + } + assert_pointer_valid(curr); + return curr; + } +public: + friend class mail_inbox; + + //! Push task_proxy onto the mailbox queue of another thread. + /** Implementation is wait-free. */ + void push( task_proxy* t ) { + assert_pointer_valid(t); + t->next_in_mailbox.store(nullptr, std::memory_order_relaxed); + atomic_proxy_ptr* const link = my_last.exchange(&t->next_in_mailbox); + // Logically, the release fence is not required because the exchange above provides the + // release-acquire semantic that guarantees that (*t) will be consistent when another thread + // loads the link atomic. However, C++11 memory model guarantees consistency of(*t) only + // when the same atomic is used for synchronization. + link->store(t, std::memory_order_release); + } + + //! Return true if mailbox is empty + bool empty() { + return my_first.load(std::memory_order_relaxed) == nullptr; + } + + //! Construct *this as a mailbox from zeroed memory. + /** Raise assertion if *this is not previously zeroed, or sizeof(this) is wrong. + This method is provided instead of a full constructor since we know the object + will be constructed in zeroed memory. */ + void construct() { + __TBB_ASSERT( sizeof(*this)==max_nfs_size, nullptr ); + __TBB_ASSERT( !my_first.load(std::memory_order_relaxed), nullptr ); + __TBB_ASSERT( !my_last.load(std::memory_order_relaxed), nullptr ); + __TBB_ASSERT( !my_is_idle.load(std::memory_order_relaxed), nullptr ); + my_last = &my_first; + suppress_unused_warning(pad); + } + + //! Drain the mailbox + void drain() { + // No fences here because other threads have already quit. + for( ; task_proxy* t = my_first; ) { + my_first.store(t->next_in_mailbox, std::memory_order_relaxed); + t->allocator.delete_object(t); + } + } + + //! True if thread that owns this mailbox is looking for work. + bool recipient_is_idle() { + return my_is_idle.load(std::memory_order_relaxed); + } +}; // class mail_outbox + +//! Class representing source of mail. +class mail_inbox { + //! Corresponding sink where mail that we receive will be put. + mail_outbox* my_putter; +public: + //! Construct unattached inbox + mail_inbox() : my_putter(nullptr) {} + + //! Attach inbox to a corresponding outbox. + void attach( mail_outbox& putter ) { + my_putter = &putter; + } + //! Detach inbox from its outbox + void detach() { + __TBB_ASSERT(my_putter,"not attached"); + my_putter = nullptr; + } + //! Get next piece of mail, or nullptr if mailbox is empty. + task_proxy* pop( isolation_type isolation ) { + return my_putter->internal_pop( isolation ); + } + //! Return true if mailbox is empty + bool empty() { + return my_putter->empty(); + } + //! 
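+    // [Editorial sketch, not upstream code] mail_outbox::push() above is the classic wait-free
+    // intrusive MPSC enqueue: a producer only exchanges the tail link and then publishes the
+    // node with a release store. Stripped to its essentials:
+    //
+    //     #include <atomic>
+    //
+    //     struct node { std::atomic<node*> next{nullptr}; };
+    //
+    //     struct mpsc_queue {
+    //         std::atomic<node*> first{nullptr};
+    //         std::atomic<std::atomic<node*>*> last{&first};
+    //         void push(node* n) {                            // wait-free for every producer
+    //             n->next.store(nullptr, std::memory_order_relaxed);
+    //             std::atomic<node*>* link = last.exchange(&n->next);
+    //             link->store(n, std::memory_order_release);  // publish *n to the consumer
+    //         }
+    //     };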
Indicate whether thread that reads this mailbox is idle. + /** Raises assertion failure if mailbox is redundantly marked as not idle. */ + void set_is_idle( bool value ) { + if( my_putter ) { + __TBB_ASSERT( my_putter->my_is_idle.load(std::memory_order_relaxed) || value, "attempt to redundantly mark mailbox as not idle" ); + my_putter->my_is_idle.store(value, std::memory_order_relaxed); + } + } + //! Indicate whether thread that reads this mailbox is idle. + bool is_idle_state ( bool value ) const { + return !my_putter || my_putter->my_is_idle.load(std::memory_order_relaxed) == value; + } +}; // class mail_inbox + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_mailbox_H */ diff --git a/third_party/tbb/main.cpp b/third_party/tbb/main.cpp new file mode 100644 index 000000000..734913362 --- /dev/null +++ b/third_party/tbb/main.cpp @@ -0,0 +1,172 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/detail/_config.h" + +#include "third_party/tbb/main.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/threading_control.h" +#include "third_party/tbb/environment.h" +#include "third_party/tbb/market.h" +#include "third_party/tbb/misc.h" +#include "third_party/tbb/itt_notify.h" + +namespace tbb { +namespace detail { +namespace r1 { + +//------------------------------------------------------------------------ +// Begin shared data layout. +// The following global data items are mostly read-only after initialization. +//------------------------------------------------------------------------ + +//------------------------------------------------------------------------ +// governor data +basic_tls governor::theTLS; +rml::tbb_factory governor::theRMLServerFactory; +bool governor::UsePrivateRML; +bool governor::is_rethrow_broken; + +//------------------------------------------------------------------------ +// threading_control data +threading_control* threading_control::g_threading_control; +threading_control::global_mutex_type threading_control::g_threading_control_mutex; + +//------------------------------------------------------------------------ +// context propagation data +context_state_propagation_mutex_type the_context_state_propagation_mutex; +std::atomic the_context_state_propagation_epoch{}; + +//------------------------------------------------------------------------ +// One time initialization data + +//! Counter of references to global shared resources such as TLS. +std::atomic __TBB_InitOnce::count{}; + +std::atomic_flag __TBB_InitOnce::InitializationLock = ATOMIC_FLAG_INIT; + +//! Flag that is set to true after one-time initializations are done. +std::atomic __TBB_InitOnce::InitializationDone{}; + +#if __TBB_USE_ITT_NOTIFY +//! 
Defined in profiling.cpp +extern bool ITT_Present; +void ITT_DoUnsafeOneTimeInitialization(); +#endif + +#if !(_WIN32||_WIN64) || __TBB_SOURCE_DIRECTLY_INCLUDED +static __TBB_InitOnce __TBB_InitOnceHiddenInstance; +#endif + +#if TBB_USE_ASSERT +std::atomic the_observer_proxy_count; + +struct check_observer_proxy_count { + ~check_observer_proxy_count() { + if (the_observer_proxy_count != 0) { + runtime_warning("Leaked %ld observer_proxy objects\n", long(the_observer_proxy_count)); + } + } +}; +// The proxy count checker shall be defined after __TBB_InitOnceHiddenInstance to check the count +// after auto termination. +static check_observer_proxy_count the_check_observer_proxy_count; +#endif /* TBB_USE_ASSERT */ + +//------------------------------------------------------------------------ +// __TBB_InitOnce +//------------------------------------------------------------------------ + +void __TBB_InitOnce::add_ref() { + if( ++count==1 ) + governor::acquire_resources(); +} + +void __TBB_InitOnce::remove_ref() { + int k = --count; + __TBB_ASSERT(k>=0,"removed __TBB_InitOnce ref that was not added?"); + if( k==0 ) { + governor::release_resources(); + ITT_FINI_ITTLIB(); + ITT_RELEASE_RESOURCES(); + } +} + +//------------------------------------------------------------------------ +// One-time Initializations +//------------------------------------------------------------------------ + +//! Defined in cache_aligned_allocator.cpp +void initialize_cache_aligned_allocator(); + +//! Performs thread-safe lazy one-time general TBB initialization. +void DoOneTimeInitialization() { + __TBB_InitOnce::lock(); + // No fence required for load of InitializationDone, because we are inside a critical section. + if( !__TBB_InitOnce::InitializationDone ) { + __TBB_InitOnce::add_ref(); + if( GetBoolEnvironmentVariable("TBB_VERSION") ) + PrintVersion(); + bool itt_present = false; +#if __TBB_USE_ITT_NOTIFY + ITT_DoUnsafeOneTimeInitialization(); + itt_present = ITT_Present; +#endif /* __TBB_USE_ITT_NOTIFY */ + initialize_cache_aligned_allocator(); + governor::initialize_rml_factory(); + // Force processor groups support detection + governor::default_num_threads(); + // Force OS regular page size detection + governor::default_page_size(); + PrintExtraVersionInfo( "TOOLS SUPPORT", itt_present ? "enabled" : "disabled" ); + __TBB_InitOnce::InitializationDone = true; + } + __TBB_InitOnce::unlock(); +} + +#if (_WIN32||_WIN64) && !__TBB_SOURCE_DIRECTLY_INCLUDED +//! Windows "DllMain" that handles startup and shutdown of dynamic library. +extern "C" bool WINAPI DllMain( HANDLE /*hinstDLL*/, DWORD reason, LPVOID lpvReserved ) { + switch( reason ) { + case DLL_PROCESS_ATTACH: + __TBB_InitOnce::add_ref(); + break; + case DLL_PROCESS_DETACH: + // Since THREAD_DETACH is not called for the main thread, call auto-termination + // here as well - but not during process shutdown (due to risk of a deadlock). + if ( lpvReserved == nullptr ) { // library unload + governor::terminate_external_thread(); + } + __TBB_InitOnce::remove_ref(); + // It is assumed that InitializationDone is not set after DLL_PROCESS_DETACH, + // and thus no race on InitializationDone is possible. + if ( __TBB_InitOnce::initialization_done() ) { + // Remove reference that we added in DoOneTimeInitialization. 
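// Illustration: the shape of the one-time initialization above (a global spin
// lock plus a "done" flag that is only written inside the critical section),
// reduced to a minimal sketch. The names init_lock, init_done and
// do_expensive_setup are placeholders for this sketch, not TBB entities.
// Kept under "#if 0" so it does not affect this translation unit.
#if 0
#include <atomic>
#include <cstdio>

static std::atomic_flag init_lock = ATOMIC_FLAG_INIT;
static std::atomic<bool> init_done{false};

static void do_expensive_setup() { std::puts("initialized"); }   // stands in for the real work

void ensure_initialized() {
    if (init_done.load(std::memory_order_acquire))
        return;                                       // fast path: already initialized
    while (init_lock.test_and_set(std::memory_order_acquire)) { /* spin */ }
    if (!init_done.load(std::memory_order_relaxed)) { // no fence needed inside the critical section
        do_expensive_setup();
        init_done.store(true, std::memory_order_release);
    }
    init_lock.clear(std::memory_order_release);
}
#endif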
+ __TBB_InitOnce::remove_ref(); + } + break; + case DLL_THREAD_DETACH: + governor::terminate_external_thread(); + break; + } + return true; +} +#endif /* (_WIN32||_WIN64) && !__TBB_SOURCE_DIRECTLY_INCLUDED */ + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/main.h b/third_party/tbb/main.h new file mode 100644 index 000000000..c23f34bc5 --- /dev/null +++ b/third_party/tbb/main.h @@ -0,0 +1,100 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_main_H +#define _TBB_main_H + +#include "third_party/tbb/governor.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace r1 { + +void DoOneTimeInitialization(); + +//------------------------------------------------------------------------ +// __TBB_InitOnce +//------------------------------------------------------------------------ + +// TODO (TBB_REVAMP_TODO): consider better names +//! Class that supports TBB initialization. +/** It handles acquisition and release of global resources (e.g. TLS) during startup and shutdown, + as well as synchronization for DoOneTimeInitialization. */ +class __TBB_InitOnce { + friend void DoOneTimeInitialization(); + friend void ITT_DoUnsafeOneTimeInitialization(); + + static std::atomic count; + + //! Platform specific code to acquire resources. + static void acquire_resources(); + + //! Platform specific code to release resources. + static void release_resources(); + + //! Specifies if the one-time initializations has been done. + static std::atomic InitializationDone; + + //! Global initialization lock + /** Scenarios are possible when tools interop has to be initialized before the + TBB itself. This imposes a requirement that the global initialization lock + has to support valid static initialization, and does not issue any tool + notifications in any build mode. **/ + static std::atomic_flag InitializationLock; + +public: + static void lock() { + tbb::detail::atomic_backoff backoff; + while( InitializationLock.test_and_set() ) backoff.pause(); + } + + static void unlock() { InitializationLock.clear(std::memory_order_release); } + + static bool initialization_done() { return InitializationDone.load(std::memory_order_acquire); } + + //! Add initial reference to resources. + /** We assume that dynamic loading of the library prevents any other threads + from entering the library until this constructor has finished running. **/ + __TBB_InitOnce() { add_ref(); } + + //! Remove the initial reference to resources. + /** This is not necessarily the last reference if other threads are still running. **/ + ~__TBB_InitOnce() { + governor::terminate_external_thread(); // TLS dtor not called for the main thread + remove_ref(); + // We assume that InitializationDone is not set after file-scope destructors + // start running, and thus no race on InitializationDone is possible. 
+ if ( initialization_done() ) { + // Remove an extra reference that was added in DoOneTimeInitialization. + remove_ref(); + } + } + //! Add reference to resources. If first reference added, acquire the resources. + static void add_ref(); + + //! Remove reference to resources. If last reference removed, release the resources. + static void remove_ref(); + +}; // class __TBB_InitOnce + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_main_H */ diff --git a/third_party/tbb/market.cpp b/third_party/tbb/market.cpp new file mode 100644 index 000000000..80a22b960 --- /dev/null +++ b/third_party/tbb/market.cpp @@ -0,0 +1,140 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/arena.h" +#include "third_party/tbb/market.h" + +#include "third_party/libcxx/algorithm" // std::find + +namespace tbb { +namespace detail { +namespace r1 { + + +class tbb_permit_manager_client : public pm_client { +public: + tbb_permit_manager_client(arena& a) : pm_client(a) {} + + void set_allotment(unsigned allotment) { + my_arena.set_allotment(allotment); + } +}; + +//------------------------------------------------------------------------ +// market +//------------------------------------------------------------------------ + +market::market(unsigned workers_soft_limit) + : my_num_workers_soft_limit(workers_soft_limit) +{} + +pm_client* market::create_client(arena& a) { + return new (cache_aligned_allocate(sizeof(tbb_permit_manager_client))) tbb_permit_manager_client(a); +} + +void market::register_client(pm_client* c) { + mutex_type::scoped_lock lock(my_mutex); + my_clients[c->priority_level()].push_back(c); +} + +void market::unregister_and_destroy_client(pm_client& c) { + { + mutex_type::scoped_lock lock(my_mutex); + auto& clients = my_clients[c.priority_level()]; + auto it = std::find(clients.begin(), clients.end(), &c); + __TBB_ASSERT(it != clients.end(), "Destroying of an unregistered client"); + clients.erase(it); + } + + auto client = static_cast(&c); + client->~tbb_permit_manager_client(); + cache_aligned_deallocate(client); +} + +void market::update_allotment() { + int effective_soft_limit = my_mandatory_num_requested > 0 && my_num_workers_soft_limit == 0 ? 
1 : my_num_workers_soft_limit; + int max_workers = min(my_total_demand, effective_soft_limit); + __TBB_ASSERT(max_workers >= 0, nullptr); + + int unassigned_workers = max_workers; + int assigned = 0; + int carry = 0; + unsigned max_priority_level = num_priority_levels; + for (unsigned list_idx = 0; list_idx < num_priority_levels; ++list_idx ) { + int assigned_per_priority = min(my_priority_level_demand[list_idx], unassigned_workers); + unassigned_workers -= assigned_per_priority; + // We use reverse iterator there to serve last added clients first + for (auto it = my_clients[list_idx].rbegin(); it != my_clients[list_idx].rend(); ++it) { + tbb_permit_manager_client& client = static_cast(**it); + if (client.max_workers() == 0) { + client.set_allotment(0); + continue; + } + + if (max_priority_level == num_priority_levels) { + max_priority_level = list_idx; + } + + int allotted = 0; + if (my_num_workers_soft_limit == 0) { + __TBB_ASSERT(max_workers == 0 || max_workers == 1, nullptr); + allotted = client.min_workers() > 0 && assigned < max_workers ? 1 : 0; + } else { + int tmp = client.max_workers() * assigned_per_priority + carry; + allotted = tmp / my_priority_level_demand[list_idx]; + carry = tmp % my_priority_level_demand[list_idx]; + __TBB_ASSERT(allotted <= client.max_workers(), nullptr); + } + client.set_allotment(allotted); + client.set_top_priority(list_idx == max_priority_level); + assigned += allotted; + } + } + __TBB_ASSERT(assigned == max_workers, nullptr); +} + +void market::set_active_num_workers(int soft_limit) { + mutex_type::scoped_lock lock(my_mutex); + if (my_num_workers_soft_limit != soft_limit) { + my_num_workers_soft_limit = soft_limit; + update_allotment(); + } +} + +void market::adjust_demand(pm_client& c, int mandatory_delta, int workers_delta) { + __TBB_ASSERT(-1 <= mandatory_delta && mandatory_delta <= 1, nullptr); + + int delta{}; + { + mutex_type::scoped_lock lock(my_mutex); + // Update client's state + delta = c.update_request(mandatory_delta, workers_delta); + + // Update market's state + my_total_demand += delta; + my_priority_level_demand[c.priority_level()] += delta; + my_mandatory_num_requested += mandatory_delta; + + update_allotment(); + } + + notify_thread_request(delta); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/market.h b/third_party/tbb/market.h new file mode 100644 index 000000000..0367a19c4 --- /dev/null +++ b/third_party/tbb/market.h @@ -0,0 +1,79 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef _TBB_market_H +#define _TBB_market_H + +#include "third_party/tbb/rw_mutex.h" +#include "third_party/tbb/tbb_allocator.h" +#include "third_party/tbb/task_arena.h" + +#include "third_party/tbb/permit_manager.h" +#include "third_party/tbb/pm_client.h" + +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/vector" + +namespace tbb { +namespace detail { +namespace r1 { + +class market : public permit_manager { +public: + market(unsigned soft_limit); + + pm_client* create_client(arena& a) override; + void register_client(pm_client* client) override; + void unregister_and_destroy_client(pm_client& c) override; + + //! Request that arena's need in workers should be adjusted. + void adjust_demand(pm_client&, int mandatory_delta, int workers_delta) override; + + //! Set number of active workers + void set_active_num_workers(int soft_limit) override; +private: + //! Recalculates the number of workers assigned to each arena in the list. + void update_allotment(); + + //! Keys for the arena map array. The lower the value the higher priority of the arena list. + static constexpr unsigned num_priority_levels = d1::num_priority_levels; + + using mutex_type = d1::rw_mutex; + mutex_type my_mutex; + + //! Current application-imposed limit on the number of workers + int my_num_workers_soft_limit; + + //! Number of workers that were requested by all arenas on all priority levels + int my_total_demand{0}; + + //! Number of workers that were requested by arenas per single priority list item + int my_priority_level_demand[num_priority_levels] = {0}; + + //! How many times mandatory concurrency was requested from the market + int my_mandatory_num_requested{0}; + + //! Per priority list of registered arenas + using clients_container_type = std::vector>; + clients_container_type my_clients[num_priority_levels]; +}; // class market + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_market_H */ diff --git a/third_party/tbb/memory_pool.h b/third_party/tbb/memory_pool.h new file mode 100644 index 000000000..14f2393e0 --- /dev/null +++ b/third_party/tbb/memory_pool.h @@ -0,0 +1,273 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_memory_pool_H +#define __TBB_memory_pool_H + +#if !TBB_PREVIEW_MEMORY_POOL +#error Set TBB_PREVIEW_MEMORY_POOL to include memory_pool.h +#endif +/** @file */ + +#include "third_party/tbb/scalable_allocator.h" + +#include "third_party/libcxx/new" // std::bad_alloc +#include "third_party/libcxx/stdexcept" // std::runtime_error, std::invalid_argument +#include "third_party/libcxx/utility" // std::forward + + +#if __TBB_EXTRA_DEBUG +#define __TBBMALLOC_ASSERT ASSERT +#else +#define __TBBMALLOC_ASSERT(a,b) ((void)0) +#endif + +namespace tbb { +namespace detail { +namespace d1 { + +//! 
Base of thread-safe pool allocator for variable-size requests +class pool_base : no_copy { + // Pool interface is separate from standard allocator classes because it has + // to maintain internal state, no copy or assignment. Move and swap are possible. +public: + //! Reset pool to reuse its memory (free all objects at once) + void recycle() { rml::pool_reset(my_pool); } + + //! The "malloc" analogue to allocate block of memory of size bytes + void *malloc(size_t size) { return rml::pool_malloc(my_pool, size); } + + //! The "free" analogue to discard a previously allocated piece of memory. + void free(void* ptr) { rml::pool_free(my_pool, ptr); } + + //! The "realloc" analogue complementing pool_malloc. + // Enables some low-level optimization possibilities + void *realloc(void* ptr, size_t size) { + return rml::pool_realloc(my_pool, ptr, size); + } + +protected: + //! destroy pool - must be called in a child class + void destroy() { rml::pool_destroy(my_pool); } + + rml::MemoryPool *my_pool; +}; + +#if _MSC_VER && !defined(__INTEL_COMPILER) + // Workaround for erroneous "unreferenced parameter" warning in method destroy. + #pragma warning (push) + #pragma warning (disable: 4100) +#endif + +//! Meets "allocator" requirements of ISO C++ Standard, Section 20.1.5 +/** @ingroup memory_allocation */ +template +class memory_pool_allocator { +protected: + typedef P pool_type; + pool_type *my_pool; + template + friend class memory_pool_allocator; + template + friend bool operator==( const memory_pool_allocator& a, const memory_pool_allocator& b); + template + friend bool operator!=( const memory_pool_allocator& a, const memory_pool_allocator& b); +public: + typedef T value_type; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef size_t size_type; + typedef ptrdiff_t difference_type; + template struct rebind { + typedef memory_pool_allocator other; + }; + + explicit memory_pool_allocator(pool_type &pool) throw() : my_pool(&pool) {} + memory_pool_allocator(const memory_pool_allocator& src) throw() : my_pool(src.my_pool) {} + template + memory_pool_allocator(const memory_pool_allocator& src) throw() : my_pool(src.my_pool) {} + + pointer address(reference x) const { return &x; } + const_pointer address(const_reference x) const { return &x; } + + //! Allocate space for n objects. + pointer allocate( size_type n, const void* /*hint*/ = nullptr) { + pointer p = static_cast( my_pool->malloc( n*sizeof(value_type) ) ); + if (!p) + throw_exception(std::bad_alloc()); + return p; + } + //! Free previously allocated block of memory. + void deallocate( pointer p, size_type ) { + my_pool->free(p); + } + //! Largest value for which method allocate might succeed. + size_type max_size() const throw() { + size_type max = static_cast(-1) / sizeof (value_type); + return (max > 0 ? max : 1); + } + //! Copy-construct value at location pointed to by p. + + template + void construct(U *p, Args&&... args) + { ::new((void *)p) U(std::forward(args)...); } + + //! Destroy value at location pointed to by p. + void destroy( pointer p ) { p->~value_type(); } + +}; + +#if _MSC_VER && !defined(__INTEL_COMPILER) + #pragma warning (pop) +#endif // warning 4100 is back + +//! 
Analogous to std::allocator, as defined in ISO C++ Standard, Section 20.4.1 +/** @ingroup memory_allocation */ +template +class memory_pool_allocator { +public: + typedef P pool_type; + typedef void* pointer; + typedef const void* const_pointer; + typedef void value_type; + template struct rebind { + typedef memory_pool_allocator other; + }; + + explicit memory_pool_allocator( pool_type &pool) throw() : my_pool(&pool) {} + memory_pool_allocator( const memory_pool_allocator& src) throw() : my_pool(src.my_pool) {} + template + memory_pool_allocator(const memory_pool_allocator& src) throw() : my_pool(src.my_pool) {} + +protected: + pool_type *my_pool; + template + friend class memory_pool_allocator; + template + friend bool operator==( const memory_pool_allocator& a, const memory_pool_allocator& b); + template + friend bool operator!=( const memory_pool_allocator& a, const memory_pool_allocator& b); +}; + +template +inline bool operator==( const memory_pool_allocator& a, const memory_pool_allocator& b) {return a.my_pool==b.my_pool;} + +template +inline bool operator!=( const memory_pool_allocator& a, const memory_pool_allocator& b) {return a.my_pool!=b.my_pool;} + +//! Thread-safe growable pool allocator for variable-size requests +template +class memory_pool : public pool_base { + Alloc my_alloc; // TODO: base-class optimization + static void *allocate_request(intptr_t pool_id, size_t & bytes); + static int deallocate_request(intptr_t pool_id, void*, size_t raw_bytes); + +public: + //! construct pool with underlying allocator + explicit memory_pool(const Alloc &src = Alloc()); + + //! destroy pool + ~memory_pool() { destroy(); } // call the callbacks first and destroy my_alloc latter +}; + +class fixed_pool : public pool_base { + void *my_buffer; + size_t my_size; + inline static void *allocate_request(intptr_t pool_id, size_t & bytes); + +public: + //! construct pool with underlying allocator + inline fixed_pool(void *buf, size_t size); + //! destroy pool + ~fixed_pool() { destroy(); } +}; + +//////////////// Implementation /////////////// + +template +memory_pool::memory_pool(const Alloc &src) : my_alloc(src) { + rml::MemPoolPolicy args(allocate_request, deallocate_request, + sizeof(typename Alloc::value_type)); + rml::MemPoolError res = rml::pool_create_v1(intptr_t(this), &args, &my_pool); + if (res!=rml::POOL_OK) + throw_exception(std::runtime_error("Can't create pool")); +} +template +void *memory_pool::allocate_request(intptr_t pool_id, size_t & bytes) { + memory_pool &self = *reinterpret_cast*>(pool_id); + const size_t unit_size = sizeof(typename Alloc::value_type); + __TBBMALLOC_ASSERT( 0 == bytes%unit_size, nullptr); + void *ptr; +#if TBB_USE_EXCEPTIONS + try { +#endif + ptr = self.my_alloc.allocate( bytes/unit_size ); +#if TBB_USE_EXCEPTIONS + } catch(...) { + return nullptr; + } +#endif + return ptr; +} +#if __TBB_MSVC_UNREACHABLE_CODE_IGNORED + // Workaround for erroneous "unreachable code" warning in the template below. 
+ // Specific for VC++ 17-18 compiler + #pragma warning (push) + #pragma warning (disable: 4702) +#endif +template +int memory_pool::deallocate_request(intptr_t pool_id, void* raw_ptr, size_t raw_bytes) { + memory_pool &self = *reinterpret_cast*>(pool_id); + const size_t unit_size = sizeof(typename Alloc::value_type); + __TBBMALLOC_ASSERT( 0 == raw_bytes%unit_size, nullptr); + self.my_alloc.deallocate( static_cast(raw_ptr), raw_bytes/unit_size ); + return 0; +} +#if __TBB_MSVC_UNREACHABLE_CODE_IGNORED + #pragma warning (pop) +#endif +inline fixed_pool::fixed_pool(void *buf, size_t size) : my_buffer(buf), my_size(size) { + if (!buf || !size) + // TODO: improve support for mode with exceptions disabled + throw_exception(std::invalid_argument("Zero in parameter is invalid")); + rml::MemPoolPolicy args(allocate_request, nullptr, size, /*fixedPool=*/true); + rml::MemPoolError res = rml::pool_create_v1(intptr_t(this), &args, &my_pool); + if (res!=rml::POOL_OK) + throw_exception(std::runtime_error("Can't create pool")); +} +inline void *fixed_pool::allocate_request(intptr_t pool_id, size_t & bytes) { + fixed_pool &self = *reinterpret_cast(pool_id); + __TBBMALLOC_ASSERT(0 != self.my_size, "The buffer must not be used twice."); + bytes = self.my_size; + self.my_size = 0; // remember that buffer has been used + return self.my_buffer; +} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::memory_pool_allocator; +using detail::d1::memory_pool; +using detail::d1::fixed_pool; +} // inline namepspace v1 +} // namespace tbb + +#undef __TBBMALLOC_ASSERT +#endif// __TBB_memory_pool_H diff --git a/third_party/tbb/misc.cpp b/third_party/tbb/misc.cpp new file mode 100644 index 000000000..5358252b8 --- /dev/null +++ b/third_party/tbb/misc.cpp @@ -0,0 +1,176 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Source file for miscellaneous entities that are infrequently referenced by +// an executing program. + +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_machine.h" + +#include "third_party/tbb/version.h" + +#include "third_party/tbb/misc.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/assert_impl.h" // Out-of-line TBB assertion handling routines are instantiated here. 
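// Illustration: how the memory_pool.h preview API introduced above might be
// used from application code. Assumes the second template parameter of
// memory_pool_allocator defaults to pool_base, as in upstream oneTBB, and
// that TBB_PREVIEW_MEMORY_POOL is defined before the include (the header
// errors out otherwise). Kept under "#if 0"; it is not part of this file.
#if 0
#define TBB_PREVIEW_MEMORY_POOL 1
#include "third_party/tbb/memory_pool.h"
#include <vector>

int main() {
    static char arena[1 << 20];
    tbb::fixed_pool pool(arena, sizeof(arena));      // every allocation is carved out of `arena`
    {
        using pool_alloc = tbb::memory_pool_allocator<int>;
        pool_alloc a(pool);
        std::vector<int, pool_alloc> v(a);           // the vector's storage comes from the pool
        for (int i = 0; i < 1000; ++i)
            v.push_back(i);
    }                                                // v returns its blocks to the pool
    pool.recycle();                                  // or release everything in one shot
    return 0;
}
#endif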
+#include "third_party/tbb/concurrent_monitor_mutex.h" + +#include "third_party/libcxx/cstdio" +#include "third_party/libcxx/cstdlib" +#include "third_party/libcxx/stdexcept" +#include "third_party/libcxx/cstring" +#include "third_party/libcxx/cstdarg" + +#if _WIN32||_WIN64 +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" +#endif + +#if !_WIN32 +#include "libc/calls/calls.h" +#include "libc/calls/weirdtypes.h" +#include "libc/runtime/pathconf.h" +#include "libc/runtime/runtime.h" +#include "libc/runtime/sysconf.h" +#include "libc/sysv/consts/f.h" +#include "libc/sysv/consts/fileno.h" +#include "libc/sysv/consts/o.h" +#include "libc/sysv/consts/ok.h" +#include "libc/time/time.h" +#include "third_party/getopt/getopt.h" +#include "third_party/musl/crypt.h" +#include "third_party/musl/lockf.h" // sysconf(_SC_PAGESIZE) +#endif + +namespace tbb { +namespace detail { +namespace r1 { + +//------------------------------------------------------------------------ +// governor data +//------------------------------------------------------------------------ +cpu_features_type governor::cpu_features; + +//------------------------------------------------------------------------ +// concurrent_monitor_mutex data +//------------------------------------------------------------------------ +#if !__TBB_USE_FUTEX +std::mutex concurrent_monitor_mutex::my_init_mutex; +#endif + + +size_t DefaultSystemPageSize() { +#if _WIN32 + SYSTEM_INFO si; + GetSystemInfo(&si); + return si.dwPageSize; +#else + return sysconf(_SC_PAGESIZE); +#endif +} + +/** The leading "\0" is here so that applying "strings" to the binary delivers a clean result. */ +static const char VersionString[] = "\0" TBB_VERSION_STRINGS; + +static bool PrintVersionFlag = false; + +void PrintVersion() { + PrintVersionFlag = true; + std::fputs(VersionString+1,stderr); +} + +void PrintExtraVersionInfo( const char* category, const char* format, ... ) { + if( PrintVersionFlag ) { + char str[1024]; std::memset(str, 0, 1024); + va_list args; va_start(args, format); + // Note: correct vsnprintf definition obtained from tbb_assert_impl.h + std::vsnprintf( str, 1024-1, format, args); + va_end(args); + std::fprintf(stderr, "oneTBB: %s\t%s\n", category, str ); + } +} + +//! check for transaction support. +#if _MSC_VER +// MISSING #include // for __cpuid +#endif + +#if __TBB_x86_32 || __TBB_x86_64 +void check_cpuid(int leaf, int sub_leaf, int registers[4]) { +#if _MSC_VER + __cpuidex(registers, leaf, sub_leaf); +#else + int reg_eax = 0; + int reg_ebx = 0; + int reg_ecx = 0; + int reg_edx = 0; +#if __TBB_x86_32 && __PIC__ + // On 32-bit systems with position-independent code GCC fails to work around the stuff in EBX + // register. We help it using backup and restore. 
+ __asm__("mov %%ebx, %%esi\n\t" + "cpuid\n\t" + "xchg %%ebx, %%esi" + : "=a"(reg_eax), "=S"(reg_ebx), "=c"(reg_ecx), "=d"(reg_edx) + : "0"(leaf), "2"(sub_leaf) // read value from eax and ecx + ); +#else + __asm__("cpuid" + : "=a"(reg_eax), "=b"(reg_ebx), "=c"(reg_ecx), "=d"(reg_edx) + : "0"(leaf), "2"(sub_leaf) // read value from eax and ecx + ); +#endif + registers[0] = reg_eax; + registers[1] = reg_ebx; + registers[2] = reg_ecx; + registers[3] = reg_edx; +#endif +} +#endif + +void detect_cpu_features(cpu_features_type& cpu_features) { + suppress_unused_warning(cpu_features); +#if __TBB_x86_32 || __TBB_x86_64 + const int rtm_ebx_mask = 1 << 11; + const int waitpkg_ecx_mask = 1 << 5; + int registers[4] = {0}; + + // Check RTM and WAITPKG + check_cpuid(7, 0, registers); + cpu_features.rtm_enabled = (registers[1] & rtm_ebx_mask) != 0; + cpu_features.waitpkg_enabled = (registers[2] & waitpkg_ecx_mask) != 0; +#endif /* (__TBB_x86_32 || __TBB_x86_64) */ +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/misc.h b/third_party/tbb/misc.h new file mode 100644 index 000000000..7f8ab5038 --- /dev/null +++ b/third_party/tbb/misc.h @@ -0,0 +1,298 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_tbb_misc_H +#define _TBB_tbb_misc_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_utils.h" + +#if __TBB_ARENA_BINDING +#include "third_party/tbb/info.h" +#endif /*__TBB_ARENA_BINDING*/ + +#if __unix__ +#include "libc/intrin/newbie.h" +#include "libc/calls/calls.h" +#include "libc/calls/struct/rlimit.h" +#include "libc/calls/struct/rusage.h" +#include "libc/calls/sysparam.h" +#include "libc/calls/weirdtypes.h" +#include "libc/limits.h" +#include "libc/sysv/consts/endian.h" +#include "libc/sysv/consts/prio.h" +#include "libc/sysv/consts/rlim.h" +#include "libc/sysv/consts/rlimit.h" +#include "libc/sysv/consts/rusage.h" // __FreeBSD_version +#if __FreeBSD_version >= 701000 +// MISSING #include +#endif +#endif + +#include "third_party/libcxx/atomic" + +// Does the operating system have a system call to pin a thread to a set of OS processors? +#define __TBB_OS_AFFINITY_SYSCALL_PRESENT ((__linux__ && !__ANDROID__) || (__FreeBSD_version >= 701000)) +// On IBM* Blue Gene* CNK nodes, the affinity API has restrictions that prevent its usability for TBB, +// and also sysconf(_SC_NPROCESSORS_ONLN) already takes process affinity into account. +#define __TBB_USE_OS_AFFINITY_SYSCALL (__TBB_OS_AFFINITY_SYSCALL_PRESENT && !__bg__) + +namespace tbb { +namespace detail { +namespace r1 { + +void runtime_warning(const char* format, ... 
); + +#if __TBB_ARENA_BINDING +class task_arena; +class task_scheduler_observer; +#endif /*__TBB_ARENA_BINDING*/ + +const std::size_t MByte = 1024*1024; + +#if __TBB_USE_WINAPI +// The Microsoft Documentation about Thread Stack Size states that +// "The default stack reservation size used by the linker is 1 MB" +const std::size_t ThreadStackSize = 1*MByte; +#else +const std::size_t ThreadStackSize = (sizeof(uintptr_t) <= 4 ? 2 : 4 )*MByte; +#endif + +#ifndef __TBB_HardwareConcurrency + +//! Returns maximal parallelism level supported by the current OS configuration. +int AvailableHwConcurrency(); + +#else + +inline int AvailableHwConcurrency() { + int n = __TBB_HardwareConcurrency(); + return n > 0 ? n : 1; // Fail safety strap +} +#endif /* __TBB_HardwareConcurrency */ + +//! Returns OS regular memory page size +size_t DefaultSystemPageSize(); + +//! Returns number of processor groups in the current OS configuration. +/** AvailableHwConcurrency must be called at least once before calling this method. **/ +int NumberOfProcessorGroups(); + +#if _WIN32||_WIN64 + +//! Retrieves index of processor group containing processor with the given index +int FindProcessorGroupIndex ( int processorIndex ); + +//! Affinitizes the thread to the specified processor group +void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ); + +#endif /* _WIN32||_WIN64 */ + +//! Prints TBB version information on stderr +void PrintVersion(); + +//! Prints arbitrary extra TBB version information on stderr +void PrintExtraVersionInfo( const char* category, const char* format, ... ); + +//! A callback routine to print RML version information on stderr +void PrintRMLVersionInfo( void* arg, const char* server_info ); + +// For TBB compilation only; not to be used in public headers +#if defined(min) || defined(max) +#undef min +#undef max +#endif + +//! Utility template function returning lesser of the two values. +/** Provided here to avoid including not strict safe .\n + In case operands cause signed/unsigned or size mismatch warnings it is caller's + responsibility to do the appropriate cast before calling the function. **/ +template +T min ( const T& val1, const T& val2 ) { + return val1 < val2 ? val1 : val2; +} + +//! Utility template function returning greater of the two values. +/** Provided here to avoid including not strict safe .\n + In case operands cause signed/unsigned or size mismatch warnings it is caller's + responsibility to do the appropriate cast before calling the function. **/ +template +T max ( const T& val1, const T& val2 ) { + return val1 < val2 ? val2 : val1; +} + +//! Utility helper structure to ease overload resolution +template struct int_to_type {}; + +//------------------------------------------------------------------------ +// FastRandom +//------------------------------------------------------------------------ + +//! A fast random number generator. +/** Uses linear congruential method. */ +class FastRandom { +private: + unsigned x, c; + static const unsigned a = 0x9e3779b1; // a big prime number +public: + //! Get a random number. + unsigned short get() { + return get(x); + } + //! Get a random number for the given seed; update the seed for next use. + unsigned short get( unsigned& seed ) { + unsigned short r = (unsigned short)(seed>>16); + __TBB_ASSERT(c&1, "c must be odd for big rng period"); + seed = seed*a+c; + return r; + } + //! Construct a random number generator. 
+ FastRandom( void* unique_ptr ) { init(uintptr_t(unique_ptr)); } + + template + void init( T seed ) { + init(seed,int_to_type()); + } + void init( uint64_t seed , int_to_type<8> ) { + init(uint32_t((seed>>32)+seed), int_to_type<4>()); + } + void init( uint32_t seed, int_to_type<4> ) { + // threads use different seeds for unique sequences + c = (seed|1)*0xba5703f5; // c must be odd, shuffle by a prime number + x = c^(seed>>1); // also shuffle x for the first get() invocation + } +}; + +//------------------------------------------------------------------------ +// Atomic extensions +//------------------------------------------------------------------------ + +//! Atomically replaces value of dst with newValue if they satisfy condition of compare predicate +/** Return value semantics is the same as for CAS. **/ +template +T1 atomic_update(std::atomic& dst, T1 newValue, Pred compare) { + T1 oldValue = dst.load(std::memory_order_acquire); + while ( compare(oldValue, newValue) ) { + if ( dst.compare_exchange_strong(oldValue, newValue) ) + break; + } + return oldValue; +} + +#if __TBB_USE_OS_AFFINITY_SYSCALL + #if __linux__ + typedef cpu_set_t basic_mask_t; + #elif __FreeBSD_version >= 701000 + typedef cpuset_t basic_mask_t; + #else + #error affinity_helper is not implemented in this OS + #endif + class affinity_helper : no_copy { + basic_mask_t* threadMask; + int is_changed; + public: + affinity_helper() : threadMask(nullptr), is_changed(0) {} + ~affinity_helper(); + void protect_affinity_mask( bool restore_process_mask ); + void dismiss(); + }; + void destroy_process_mask(); +#else + class affinity_helper : no_copy { + public: + void protect_affinity_mask( bool ) {} + }; + inline void destroy_process_mask(){} +#endif /* __TBB_USE_OS_AFFINITY_SYSCALL */ + +struct cpu_features_type { + bool rtm_enabled{false}; + bool waitpkg_enabled{false}; +}; + +void detect_cpu_features(cpu_features_type& cpu_features); + +#if __TBB_ARENA_BINDING +class binding_handler; + +binding_handler* construct_binding_handler(int slot_num, int numa_id, int core_type_id, int max_threads_per_core); +void destroy_binding_handler(binding_handler* handler_ptr); +void apply_affinity_mask(binding_handler* handler_ptr, int slot_num); +void restore_affinity_mask(binding_handler* handler_ptr, int slot_num); + +#endif /*__TBB_ARENA_BINDING*/ + +// RTM specific section +// abort code for mutexes that detect a conflict with another thread. +enum { + speculation_not_supported = 0x00, + speculation_transaction_aborted = 0x01, + speculation_can_retry = 0x02, + speculation_memadd_conflict = 0x04, + speculation_buffer_overflow = 0x08, + speculation_breakpoint_hit = 0x10, + speculation_nested_abort = 0x20, + speculation_xabort_mask = 0xFF000000, + speculation_xabort_shift = 24, + speculation_xabort_not_free = 0xFF, // The value (0xFF) below comes from the Intel(R) 64 and IA-32 Architectures Optimization Reference Manual 12.4.5 lock not free + speculation_successful_begin = 0xFFFFFFFF, + speculation_retry = speculation_transaction_aborted + | speculation_can_retry + | speculation_memadd_conflict +}; + +// We suppose that successful transactions are sequentially ordered and +// do not require additional memory fences around them. +// Technically it can be achieved only if xbegin has implicit +// acquire memory semantics an xend/xabort has release memory semantics on compiler and hardware level. 
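// Illustration: a typical use of the atomic_update helper above, keeping a
// high-water mark across threads. peak_depth and note_depth are placeholder
// names for this sketch; atomic_update itself lives in tbb::detail::r1.
// Kept under "#if 0" so it does not affect this translation unit.
#if 0
#include <atomic>
#include <cstddef>

std::atomic<std::size_t> peak_depth{0};

void note_depth(std::size_t depth) {
    // The predicate returns true while the stored value should still be
    // replaced, so the CAS loop inside atomic_update keeps the maximum seen.
    atomic_update(peak_depth, depth,
                  [](std::size_t current, std::size_t candidate) { return current < candidate; });
}
#endif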
+// See the article: https://arxiv.org/pdf/1710.04839.pdf +static inline unsigned int begin_transaction() { +#if __TBB_TSX_INTRINSICS_PRESENT + return _xbegin(); +#else + return speculation_not_supported; // return unsuccessful code +#endif +} + +static inline void end_transaction() { +#if __TBB_TSX_INTRINSICS_PRESENT + _xend(); +#endif +} + +static inline void abort_transaction() { +#if __TBB_TSX_INTRINSICS_PRESENT + _xabort(speculation_xabort_not_free); +#endif +} + +#if TBB_USE_ASSERT +static inline unsigned char is_in_transaction() { +#if __TBB_TSX_INTRINSICS_PRESENT + return _xtest(); +#else + return 0; +#endif +} +#endif // TBB_USE_ASSERT + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_tbb_misc_H */ diff --git a/third_party/tbb/misc_ex.cpp b/third_party/tbb/misc_ex.cpp new file mode 100644 index 000000000..feb0ad05f --- /dev/null +++ b/third_party/tbb/misc_ex.cpp @@ -0,0 +1,457 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Source file for miscellaneous entities that are infrequently referenced by +// an executing program, and implementation of which requires dynamic linking. + +#include "third_party/tbb/misc.h" + +#if !defined(__TBB_HardwareConcurrency) + +#include "third_party/tbb/dynamic_link.h" +#include "libc/calls/calls.h" +#include "libc/calls/weirdtypes.h" +#include "libc/fmt/fmt.h" +#include "libc/stdio/dprintf.h" +#include "libc/stdio/stdio.h" +#include "libc/stdio/temp.h" +#include "third_party/musl/tempnam.h" +#include "libc/limits.h" +#include "libc/sysv/consts/_posix.h" +#include "libc/sysv/consts/iov.h" +#include "libc/sysv/consts/limits.h" +#include "libc/sysv/consts/xopen.h" +#include "libc/thread/thread.h" + +#if _WIN32||_WIN64 +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" +#if __TBB_WIN8UI_SUPPORT +#include "third_party/libcxx/thread" +#endif +#else +#include "libc/calls/calls.h" +#include "libc/calls/weirdtypes.h" +#include "libc/runtime/pathconf.h" +#include "libc/runtime/runtime.h" +#include "libc/runtime/sysconf.h" +#include "libc/sysv/consts/f.h" +#include "libc/sysv/consts/fileno.h" +#include "libc/sysv/consts/o.h" +#include "libc/sysv/consts/ok.h" +#include "libc/time/time.h" +#include "third_party/getopt/getopt.h" +#include "third_party/musl/crypt.h" +#include "third_party/musl/lockf.h" +#if __unix__ +#if __linux__ +#include "libc/calls/calls.h" +#include "libc/calls/struct/sysinfo.h" +#endif +#include "third_party/libcxx/cstring" 
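// Illustration: a simplified lock-elision sketch built on the transaction
// helpers declared in misc.h (begin_transaction / end_transaction /
// abort_transaction). This is not the actual rtm_mutex implementation; the
// flag, the retry policy and the names are assumptions for this sketch only.
// Kept under "#if 0" so it does not affect this translation unit.
#if 0
#include <atomic>

using namespace tbb::detail::r1;                     // where the helpers live

static std::atomic<bool> fallback_busy{false};       // fallback spin flag (hypothetical)
static long shared_value = 0;                        // state guarded by the elided lock (hypothetical)

void elided_increment() {
    if (begin_transaction() == speculation_successful_begin) {
        // Speculative path: abort if somebody holds the fallback lock, so the
        // two paths can never run concurrently.
        if (fallback_busy.load(std::memory_order_relaxed))
            abort_transaction();
        ++shared_value;
        end_transaction();                           // commit
        return;
    }
    // TSX unavailable or the transaction aborted: take the fallback lock.
    while (fallback_busy.exchange(true, std::memory_order_acquire)) { /* spin */ }
    ++shared_value;
    fallback_busy.store(false, std::memory_order_release);
}
#endif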
+#include "libc/calls/calls.h" +#include "libc/calls/struct/cpuset.h" +#include "libc/calls/struct/sched_param.h" +#include "libc/calls/weirdtypes.h" +#include "libc/sysv/consts/sched.h" +#include "third_party/libcxx/cerrno" +#elif __sun +#include "libc/calls/calls.h" +#include "libc/calls/struct/sysinfo.h" +#elif __FreeBSD__ +#include "third_party/libcxx/cerrno" +#include "third_party/libcxx/cstring" +#include "libc/intrin/newbie.h" +#include "libc/calls/calls.h" +#include "libc/calls/struct/rlimit.h" +#include "libc/calls/struct/rusage.h" +#include "libc/calls/sysparam.h" +#include "libc/calls/weirdtypes.h" +#include "libc/limits.h" +#include "libc/sysv/consts/endian.h" +#include "libc/sysv/consts/prio.h" +#include "libc/sysv/consts/rlim.h" +#include "libc/sysv/consts/rlimit.h" +#include "libc/sysv/consts/rusage.h" // Required by +// MISSING #include +#endif +#endif + +namespace tbb { +namespace detail { +namespace r1 { + +#if __TBB_USE_OS_AFFINITY_SYSCALL + +#if __unix__ +// Handlers for interoperation with libiomp +static int (*libiomp_try_restoring_original_mask)(); +// Table for mapping to libiomp entry points +static const dynamic_link_descriptor iompLinkTable[] = { + DLD_NOWEAK( kmp_set_thread_affinity_mask_initial, libiomp_try_restoring_original_mask ) +}; +#endif + +static void set_thread_affinity_mask( std::size_t maskSize, const basic_mask_t* threadMask ) { +#if __FreeBSD__ || __NetBSD__ || __OpenBSD__ + if( cpuset_setaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) ) +#else /* __unix__ */ + if( sched_setaffinity( 0, maskSize, threadMask ) ) +#endif + // Here and below the error severity is lowered from critical level + // because it may happen during TBB library unload because of not + // waiting for workers to complete (current RML policy, to be fixed). + // handle_perror( errno, "setaffinity syscall" ); + runtime_warning( "setaffinity syscall failed" ); +} + +static void get_thread_affinity_mask( std::size_t maskSize, basic_mask_t* threadMask ) { +#if __FreeBSD__ || __NetBSD__ || __OpenBSD__ + if( cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) ) +#else /* __unix__ */ + if( sched_getaffinity( 0, maskSize, threadMask ) ) +#endif + runtime_warning( "getaffinity syscall failed" ); +} + +static basic_mask_t* process_mask; +static int num_masks; + +void destroy_process_mask() { + delete [] process_mask; + process_mask = nullptr; +} + +#define curMaskSize sizeof(basic_mask_t) * num_masks +affinity_helper::~affinity_helper() { + if( threadMask ) { + if( is_changed ) { + set_thread_affinity_mask( curMaskSize, threadMask ); + } + delete [] threadMask; + } +} +void affinity_helper::protect_affinity_mask( bool restore_process_mask ) { + if( threadMask == nullptr && num_masks ) { // TODO: assert num_masks validity? + threadMask = new basic_mask_t [num_masks]; + std::memset( threadMask, 0, curMaskSize ); + get_thread_affinity_mask( curMaskSize, threadMask ); + if( restore_process_mask ) { + __TBB_ASSERT( process_mask, "A process mask is requested but not yet stored" ); + is_changed = memcmp( process_mask, threadMask, curMaskSize ); + if( is_changed ) + set_thread_affinity_mask( curMaskSize, process_mask ); + } else { + // Assume that the mask will be changed by the caller. 
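// Illustration: the intended RAII use of affinity_helper (declared in misc.h).
// protect_affinity_mask() snapshots the current thread mask; the destructor
// restores it unless dismiss() is called. The function name below is a
// placeholder for this sketch. Kept under "#if 0".
#if 0
void run_with_temporary_affinity() {
    affinity_helper guard;
    guard.protect_affinity_mask(/*restore_process_mask=*/false);  // snapshot; we intend to change the mask
    // ... pin the thread to some CPUs and run the work ...
}                                                                 // leaving the scope restores the snapshot
#endif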
+ is_changed = 1; + } + } +} +void affinity_helper::dismiss() { + delete [] threadMask; + threadMask = nullptr; + is_changed = 0; +} +#undef curMaskSize + +static std::atomic hardware_concurrency_info; + +static int theNumProcs; + +static void initialize_hardware_concurrency_info () { + int err; + int availableProcs = 0; + int numMasks = 1; + int maxProcs = sysconf(_SC_NPROCESSORS_ONLN); + basic_mask_t* processMask; + const std::size_t BasicMaskSize = sizeof(basic_mask_t); + for (;;) { + const int curMaskSize = BasicMaskSize * numMasks; + processMask = new basic_mask_t[numMasks]; + std::memset( processMask, 0, curMaskSize ); +#if __FreeBSD__ || __NetBSD__ || __OpenBSD__ + // CPU_LEVEL_WHICH - anonymous (current) mask, CPU_LEVEL_CPUSET - assigned mask + err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, curMaskSize, processMask ); + if ( !err || errno != ERANGE || curMaskSize * CHAR_BIT >= 16 * 1024 ) + break; +#else /* __unix__ */ + int pid = getpid(); + err = sched_getaffinity( pid, curMaskSize, processMask ); + if ( !err || errno != EINVAL || curMaskSize * CHAR_BIT >= 256 * 1024 ) + break; +#endif + delete[] processMask; + numMasks <<= 1; + } + if ( !err ) { + // We have found the mask size and captured the process affinity mask into processMask. + num_masks = numMasks; // do here because it's needed for affinity_helper to work +#if __unix__ + // For better coexistence with libiomp which might have changed the mask already, + // check for its presence and ask it to restore the mask. + dynamic_link_handle libhandle; + if ( dynamic_link( "libiomp5.so", iompLinkTable, 1, &libhandle, DYNAMIC_LINK_GLOBAL ) ) { + // We have found the symbol provided by libiomp5 for restoring original thread affinity. + affinity_helper affhelp; + affhelp.protect_affinity_mask( /*restore_process_mask=*/false ); + if ( libiomp_try_restoring_original_mask()==0 ) { + // Now we have the right mask to capture, restored by libiomp. + const int curMaskSize = BasicMaskSize * numMasks; + std::memset( processMask, 0, curMaskSize ); + get_thread_affinity_mask( curMaskSize, processMask ); + } else + affhelp.dismiss(); // thread mask has not changed + dynamic_unlink( libhandle ); + // Destructor of affinity_helper restores the thread mask (unless dismissed). + } +#endif + for ( int m = 0; availableProcs < maxProcs && m < numMasks; ++m ) { + for ( std::size_t i = 0; (availableProcs < maxProcs) && (i < BasicMaskSize * CHAR_BIT); ++i ) { + if ( CPU_ISSET( i, processMask + m ) ) + ++availableProcs; + } + } + process_mask = processMask; + } + else { + // Failed to get the process affinity mask; assume the whole machine can be used. + availableProcs = (maxProcs == INT_MAX) ? sysconf(_SC_NPROCESSORS_ONLN) : maxProcs; + delete[] processMask; + } + theNumProcs = availableProcs > 0 ? availableProcs : 1; // Fail safety strap + __TBB_ASSERT( theNumProcs <= sysconf(_SC_NPROCESSORS_ONLN), nullptr); +} + +int AvailableHwConcurrency() { + atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info ); + return theNumProcs; +} + +/* End of __TBB_USE_OS_AFFINITY_SYSCALL implementation */ +#elif __ANDROID__ + +// Work-around for Android that reads the correct number of available CPUs since system calls are unreliable. 
+// Format of "present" file is: ([-|],)+ +int AvailableHwConcurrency() { + FILE *fp = fopen("/sys/devices/system/cpu/present", "r"); + if (fp == nullptr) return 1; + int num_args, lower, upper, num_cpus=0; + while ((num_args = fscanf(fp, "%u-%u", &lower, &upper)) != EOF) { + switch(num_args) { + case 2: num_cpus += upper - lower + 1; break; + case 1: num_cpus += 1; break; + } + fscanf(fp, ","); + } + return (num_cpus > 0) ? num_cpus : 1; +} + +#elif defined(_SC_NPROCESSORS_ONLN) + +int AvailableHwConcurrency() { + int n = sysconf(_SC_NPROCESSORS_ONLN); + return (n > 0) ? n : 1; +} + +#elif _WIN32||_WIN64 + +static std::atomic hardware_concurrency_info; + +static const WORD TBB_ALL_PROCESSOR_GROUPS = 0xffff; + +// Statically allocate an array for processor group information. +// Windows 7 supports maximum 4 groups, but let's look ahead a little. +static const WORD MaxProcessorGroups = 64; + +struct ProcessorGroupInfo { + DWORD_PTR mask; ///< Affinity mask covering the whole group + int numProcs; ///< Number of processors in the group + int numProcsRunningTotal; ///< Subtotal of processors in this and preceding groups + + //! Total number of processor groups in the system + static int NumGroups; + + //! Index of the group with a slot reserved for the first external thread + /** In the context of multiple processor groups support current implementation + defines "the first external thread" as the first thread to invoke + AvailableHwConcurrency(). + + TODO: Implement a dynamic scheme remapping workers depending on the pending + external threads affinity. **/ + static int HoleIndex; +}; + +int ProcessorGroupInfo::NumGroups = 1; +int ProcessorGroupInfo::HoleIndex = 0; + +ProcessorGroupInfo theProcessorGroups[MaxProcessorGroups]; +int calculate_numa[MaxProcessorGroups]; //Array needed for FindProcessorGroupIndex to calculate Processor Group when number of threads > number of cores to distribute threads evenly between processor groups +int numaSum; +struct TBB_GROUP_AFFINITY { + DWORD_PTR Mask; + WORD Group; + WORD Reserved[3]; +}; + +static DWORD (WINAPI *TBB_GetActiveProcessorCount)( WORD groupIndex ) = nullptr; +static WORD (WINAPI *TBB_GetActiveProcessorGroupCount)() = nullptr; +static BOOL (WINAPI *TBB_SetThreadGroupAffinity)( HANDLE hThread, + const TBB_GROUP_AFFINITY* newAff, TBB_GROUP_AFFINITY *prevAff ); +static BOOL (WINAPI *TBB_GetThreadGroupAffinity)( HANDLE hThread, TBB_GROUP_AFFINITY* ); + +static const dynamic_link_descriptor ProcessorGroupsApiLinkTable[] = { + DLD(GetActiveProcessorCount, TBB_GetActiveProcessorCount) + , DLD(GetActiveProcessorGroupCount, TBB_GetActiveProcessorGroupCount) + , DLD(SetThreadGroupAffinity, TBB_SetThreadGroupAffinity) + , DLD(GetThreadGroupAffinity, TBB_GetThreadGroupAffinity) +}; + +static void initialize_hardware_concurrency_info () { + suppress_unused_warning(TBB_ALL_PROCESSOR_GROUPS); +#if __TBB_WIN8UI_SUPPORT + // For these applications processor groups info is unavailable + // Setting up a number of processors for one processor group + theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = std::thread::hardware_concurrency(); +#else /* __TBB_WIN8UI_SUPPORT */ + dynamic_link( "Kernel32.dll", ProcessorGroupsApiLinkTable, + sizeof(ProcessorGroupsApiLinkTable)/sizeof(dynamic_link_descriptor) ); + SYSTEM_INFO si; + GetNativeSystemInfo(&si); + DWORD_PTR pam, sam, m = 1; + GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam ); + int nproc = 0; + for ( std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) { + if ( 
pam & m ) + ++nproc; + } + __TBB_ASSERT( nproc <= (int)si.dwNumberOfProcessors, nullptr); + // By default setting up a number of processors for one processor group + theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc; + // Setting up processor groups in case the process does not restrict affinity mask and more than one processor group is present + if ( nproc == (int)si.dwNumberOfProcessors && TBB_GetActiveProcessorCount ) { + // The process does not have restricting affinity mask and multiple processor groups are possible + ProcessorGroupInfo::NumGroups = (int)TBB_GetActiveProcessorGroupCount(); + __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, nullptr); + // Fail safety bootstrap. Release versions will limit available concurrency + // level, while debug ones would assert. + if ( ProcessorGroupInfo::NumGroups > MaxProcessorGroups ) + ProcessorGroupInfo::NumGroups = MaxProcessorGroups; + if ( ProcessorGroupInfo::NumGroups > 1 ) { + TBB_GROUP_AFFINITY ga; + if ( TBB_GetThreadGroupAffinity( GetCurrentThread(), &ga ) ) + ProcessorGroupInfo::HoleIndex = ga.Group; + int nprocs = 0; + int min_procs = INT_MAX; + for ( WORD i = 0; i < ProcessorGroupInfo::NumGroups; ++i ) { + ProcessorGroupInfo &pgi = theProcessorGroups[i]; + pgi.numProcs = (int)TBB_GetActiveProcessorCount(i); + if (pgi.numProcs < min_procs) min_procs = pgi.numProcs; //Finding the minimum number of processors in the Processor Groups + calculate_numa[i] = pgi.numProcs; + __TBB_ASSERT( pgi.numProcs <= (int)sizeof(DWORD_PTR) * CHAR_BIT, nullptr); + pgi.mask = pgi.numProcs == sizeof(DWORD_PTR) * CHAR_BIT ? ~(DWORD_PTR)0 : (DWORD_PTR(1) << pgi.numProcs) - 1; + pgi.numProcsRunningTotal = nprocs += pgi.numProcs; + } + __TBB_ASSERT( nprocs == (int)TBB_GetActiveProcessorCount( TBB_ALL_PROCESSOR_GROUPS ), nullptr); + + calculate_numa[0] = (calculate_numa[0] / min_procs)-1; + for (WORD i = 1; i < ProcessorGroupInfo::NumGroups; ++i) { + calculate_numa[i] = calculate_numa[i-1] + (calculate_numa[i] / min_procs); + } + + numaSum = calculate_numa[ProcessorGroupInfo::NumGroups - 1]; + + } + + } +#endif /* __TBB_WIN8UI_SUPPORT */ + + PrintExtraVersionInfo("Processor groups", "%d", ProcessorGroupInfo::NumGroups); + if (ProcessorGroupInfo::NumGroups>1) + for (int i=0; i= theProcessorGroups[current_grp_idx].numProcs && procIdx < theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal) { + procIdx = procIdx - theProcessorGroups[current_grp_idx].numProcs; + do { + current_grp_idx = (current_grp_idx + 1) % (ProcessorGroupInfo::NumGroups); + procIdx = procIdx - theProcessorGroups[current_grp_idx].numProcs; + + } while (procIdx >= 0); + } + else if (procIdx >= theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal) { + int temp_grp_index = 0; + procIdx = procIdx - theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal; + procIdx = procIdx % (numaSum+1); //ProcIdx to stay between 0 and numaSum + + while (procIdx - calculate_numa[temp_grp_index] > 0) { + temp_grp_index = (temp_grp_index + 1) % ProcessorGroupInfo::NumGroups; + } + current_grp_idx = temp_grp_index; + } + __TBB_ASSERT(current_grp_idx < ProcessorGroupInfo::NumGroups, nullptr); + + return current_grp_idx; +} + +void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ) { + __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "MoveThreadIntoProcessorGroup is used before AvailableHwConcurrency" ); + if ( !TBB_SetThreadGroupAffinity ) + return; + TBB_GROUP_AFFINITY 
ga = { theProcessorGroups[groupIndex].mask, (WORD)groupIndex, {0,0,0} }; + TBB_SetThreadGroupAffinity( hThread, &ga, nullptr); +} + +int AvailableHwConcurrency() { + atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info ); + return theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal; +} + +/* End of _WIN32||_WIN64 implementation */ +#else + #error AvailableHwConcurrency is not implemented for this OS +#endif + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* !__TBB_HardwareConcurrency */ diff --git a/third_party/tbb/mutex.h b/third_party/tbb/mutex.h new file mode 100644 index 000000000..791ba7798 --- /dev/null +++ b/third_party/tbb/mutex.h @@ -0,0 +1,94 @@ +// clang-format off +/* + Copyright (c) 2021-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_mutex_H +#define __TBB_mutex_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_scoped_lock.h" +#include "third_party/tbb/detail/_waitable_atomic.h" +#include "third_party/tbb/detail/_mutex_common.h" +#include "third_party/tbb/profiling.h" + +namespace tbb { +namespace detail { +namespace d1 { + +class mutex { +public: + //! Constructors + mutex() { + create_itt_sync(this, "tbb::mutex", ""); + }; + + //! Destructor + ~mutex() = default; + + //! No Copy + mutex(const mutex&) = delete; + mutex& operator=(const mutex&) = delete; + + using scoped_lock = unique_scoped_lock; + + //! Mutex traits + static constexpr bool is_rw_mutex = false; + static constexpr bool is_recursive_mutex = false; + static constexpr bool is_fair_mutex = false; + + //! Acquire lock + /** Spin if the lock is taken */ + void lock() { + call_itt_notify(prepare, this); + while (!try_lock()) { + my_flag.wait(true, /* context = */ 0, std::memory_order_relaxed); + } + } + + //! Try acquiring lock (non-blocking) + /** Return true if lock acquired; false otherwise. */ + bool try_lock() { + bool result = !my_flag.load(std::memory_order_relaxed) && !my_flag.exchange(true); + if (result) { + call_itt_notify(acquired, this); + } + return result; + } + + //! Release lock + void unlock() { + call_itt_notify(releasing, this); + // We need Write Read memory barrier before notify that reads the waiter list. + // In C++ only full fence covers this type of barrier. 
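+        // exchange() (a sequentially consistent read-modify-write) supplies that
+        // full fence; a plain store with release ordering would not.
+        // notify_one_relaxed() then wakes at most one thread parked in lock()'s
+        // my_flag.wait() loop above.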
+ my_flag.exchange(false); + my_flag.notify_one_relaxed(); + } + +private: + waitable_atomic my_flag{0}; +}; // class mutex + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::mutex; +} // namespace v1 + +} // namespace tbb + +#endif // __TBB_mutex_H diff --git a/third_party/tbb/null_mutex.h b/third_party/tbb/null_mutex.h new file mode 100644 index 000000000..d0c3cfb99 --- /dev/null +++ b/third_party/tbb/null_mutex.h @@ -0,0 +1,81 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_null_mutex_H +#define __TBB_null_mutex_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_mutex_common.h" + +namespace tbb { +namespace detail { +namespace d1 { + +//! A mutex which does nothing +/** A null_mutex does no operation and simulates success. + @ingroup synchronization */ +class null_mutex { +public: + //! Constructors + constexpr null_mutex() noexcept = default; + + //! Destructor + ~null_mutex() = default; + + //! No Copy + null_mutex(const null_mutex&) = delete; + null_mutex& operator=(const null_mutex&) = delete; + + //! Represents acquisition of a mutex. + class scoped_lock { + public: + //! Constructors + constexpr scoped_lock() noexcept = default; + scoped_lock(null_mutex&) {} + + //! Destructor + ~scoped_lock() = default; + + //! No Copy + scoped_lock(const scoped_lock&) = delete; + scoped_lock& operator=(const scoped_lock&) = delete; + + void acquire(null_mutex&) {} + bool try_acquire(null_mutex&) { return true; } + void release() {} + }; + + //! Mutex traits + static constexpr bool is_rw_mutex = false; + static constexpr bool is_recursive_mutex = true; + static constexpr bool is_fair_mutex = true; + + void lock() {} + bool try_lock() { return true; } + void unlock() {} +}; // class null_mutex + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::null_mutex; +} // namespace v1 +} // namespace tbb + +#endif /* __TBB_null_mutex_H */ diff --git a/third_party/tbb/null_rw_mutex.h b/third_party/tbb/null_rw_mutex.h new file mode 100644 index 000000000..d8dff0488 --- /dev/null +++ b/third_party/tbb/null_rw_mutex.h @@ -0,0 +1,88 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_null_rw_mutex_H +#define __TBB_null_rw_mutex_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_mutex_common.h" + +namespace tbb { +namespace detail { +namespace d1 { + +//! A rw mutex which does nothing +/** A null_rw_mutex is a rw mutex that does nothing and simulates successful operation. + @ingroup synchronization */ +class null_rw_mutex { +public: + //! Constructors + constexpr null_rw_mutex() noexcept = default; + + //! Destructor + ~null_rw_mutex() = default; + + //! No Copy + null_rw_mutex(const null_rw_mutex&) = delete; + null_rw_mutex& operator=(const null_rw_mutex&) = delete; + + //! Represents acquisition of a mutex. + class scoped_lock { + public: + //! Constructors + constexpr scoped_lock() noexcept = default; + scoped_lock(null_rw_mutex&, bool = true) {} + + //! Destructor + ~scoped_lock() = default; + + //! No Copy + scoped_lock(const scoped_lock&) = delete; + scoped_lock& operator=(const scoped_lock&) = delete; + + void acquire(null_rw_mutex&, bool = true) {} + bool try_acquire(null_rw_mutex&, bool = true) { return true; } + void release() {} + bool upgrade_to_writer() { return true; } + bool downgrade_to_reader() { return true; } + + bool is_writer() const { return true; } + }; + + //! Mutex traits + static constexpr bool is_rw_mutex = true; + static constexpr bool is_recursive_mutex = true; + static constexpr bool is_fair_mutex = true; + + void lock() {} + bool try_lock() { return true; } + void unlock() {} + void lock_shared() {} + bool try_lock_shared() { return true; } + void unlock_shared() {} +}; // class null_rw_mutex + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::null_rw_mutex; +} // namespace v1 +} // namespace tbb + +#endif /* __TBB_null_rw_mutex_H */ diff --git a/third_party/tbb/observer_proxy.cpp b/third_party/tbb/observer_proxy.cpp new file mode 100644 index 000000000..463f57809 --- /dev/null +++ b/third_party/tbb/observer_proxy.cpp @@ -0,0 +1,320 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_utils.h" + +#include "third_party/tbb/observer_proxy.h" +#include "third_party/tbb/arena.h" +#include "third_party/tbb/main.h" +#include "third_party/tbb/thread_data.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace r1 { + +#if TBB_USE_ASSERT +extern std::atomic the_observer_proxy_count; +#endif /* TBB_USE_ASSERT */ + +observer_proxy::observer_proxy( d1::task_scheduler_observer& tso ) + : my_ref_count(1), my_list(nullptr), my_next(nullptr), my_prev(nullptr), my_observer(&tso) +{ +#if TBB_USE_ASSERT + ++the_observer_proxy_count; +#endif /* TBB_USE_ASSERT */ +} + +observer_proxy::~observer_proxy() { + __TBB_ASSERT( !my_ref_count, "Attempt to destroy proxy still in use" ); + poison_value(my_ref_count); + poison_pointer(my_prev); + poison_pointer(my_next); +#if TBB_USE_ASSERT + --the_observer_proxy_count; +#endif /* TBB_USE_ASSERT */ +} + +void observer_list::clear() { + { + scoped_lock lock(mutex(), /*is_writer=*/true); + observer_proxy *next = my_head.load(std::memory_order_relaxed); + while ( observer_proxy *p = next ) { + next = p->my_next; + // Both proxy p and observer p->my_observer (if non-null) are guaranteed + // to be alive while the list is locked. + d1::task_scheduler_observer *obs = p->my_observer; + // Make sure that possible concurrent observer destruction does not + // conflict with the proxy list cleanup. + if (!obs || !(p = obs->my_proxy.exchange(nullptr))) { + continue; + } + // accessing 'obs' after detaching of obs->my_proxy leads to the race with observer destruction + __TBB_ASSERT(!next || p == next->my_prev, nullptr); + __TBB_ASSERT(is_alive(p->my_ref_count), "Observer's proxy died prematurely"); + __TBB_ASSERT(p->my_ref_count.load(std::memory_order_relaxed) == 1, "Reference for observer is missing"); + poison_pointer(p->my_observer); + remove(p); + --p->my_ref_count; + delete p; + } + } + + // If observe(false) is called concurrently with the destruction of the arena, + // need to wait until all proxies are removed. 
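+    // Re-acquiring the reader lock on every iteration lets a concurrent
+    // observe(false) take the writer lock in between and finish detaching its
+    // proxy, while atomic_backoff keeps this polling loop from spinning hard.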
+ for (atomic_backoff backoff; ; backoff.pause()) { + scoped_lock lock(mutex(), /*is_writer=*/false); + if (my_head.load(std::memory_order_relaxed) == nullptr) { + break; + } + } + + __TBB_ASSERT(my_head.load(std::memory_order_relaxed) == nullptr && my_tail.load(std::memory_order_relaxed) == nullptr, nullptr); +} + +void observer_list::insert( observer_proxy* p ) { + scoped_lock lock(mutex(), /*is_writer=*/true); + if (my_head.load(std::memory_order_relaxed)) { + p->my_prev = my_tail.load(std::memory_order_relaxed); + my_tail.load(std::memory_order_relaxed)->my_next = p; + } else { + my_head.store(p, std::memory_order_relaxed); + } + my_tail.store(p, std::memory_order_relaxed); +} + +void observer_list::remove(observer_proxy* p) { + __TBB_ASSERT(my_head.load(std::memory_order_relaxed), "Attempt to remove an item from an empty list"); + __TBB_ASSERT(!my_tail.load(std::memory_order_relaxed)->my_next, "Last item's my_next must be nullptr"); + if (p == my_tail.load(std::memory_order_relaxed)) { + __TBB_ASSERT(!p->my_next, nullptr); + my_tail.store(p->my_prev, std::memory_order_relaxed); + } else { + __TBB_ASSERT(p->my_next, nullptr); + p->my_next->my_prev = p->my_prev; + } + if (p == my_head.load(std::memory_order_relaxed)) { + __TBB_ASSERT(!p->my_prev, nullptr); + my_head.store(p->my_next, std::memory_order_relaxed); + } else { + __TBB_ASSERT(p->my_prev, nullptr); + p->my_prev->my_next = p->my_next; + } + __TBB_ASSERT((my_head.load(std::memory_order_relaxed) && my_tail.load(std::memory_order_relaxed)) || + (!my_head.load(std::memory_order_relaxed) && !my_tail.load(std::memory_order_relaxed)), nullptr); +} + +void observer_list::remove_ref(observer_proxy* p) { + std::uintptr_t r = p->my_ref_count.load(std::memory_order_acquire); + __TBB_ASSERT(is_alive(r), nullptr); + while (r > 1) { + if (p->my_ref_count.compare_exchange_strong(r, r - 1)) { + return; + } + } + __TBB_ASSERT(r == 1, nullptr); + // Reference count might go to zero + { + // Use lock to avoid resurrection by a thread concurrently walking the list + observer_list::scoped_lock lock(mutex(), /*is_writer=*/true); + r = --p->my_ref_count; + if (!r) { + remove(p); + } + } + __TBB_ASSERT(r || !p->my_ref_count, nullptr); + if (!r) { + delete p; + } +} + +void observer_list::do_notify_entry_observers(observer_proxy*& last, bool worker) { + // Pointer p marches though the list from last (exclusively) to the end. + observer_proxy* p = last, * prev = p; + for (;;) { + d1::task_scheduler_observer* tso = nullptr; + // Hold lock on list only long enough to advance to the next proxy in the list. + { + scoped_lock lock(mutex(), /*is_writer=*/false); + do { + if (p) { + // We were already processing the list. + if (observer_proxy* q = p->my_next) { + if (p == prev) { + remove_ref_fast(prev); // sets prev to nullptr if successful + } + p = q; + } else { + // Reached the end of the list. + if (p == prev) { + // Keep the reference as we store the 'last' pointer in scheduler + __TBB_ASSERT(int(p->my_ref_count.load(std::memory_order_relaxed)) >= 1 + (p->my_observer ? 
1 : 0), nullptr); + } else { + // The last few proxies were empty + __TBB_ASSERT(int(p->my_ref_count.load(std::memory_order_relaxed)), nullptr); + ++p->my_ref_count; + if (prev) { + lock.release(); + remove_ref(prev); + } + } + last = p; + return; + } + } else { + // Starting pass through the list + p = my_head.load(std::memory_order_relaxed); + if (!p) { + return; + } + } + tso = p->my_observer; + } while (!tso); + ++p->my_ref_count; + ++tso->my_busy_count; + } + __TBB_ASSERT(!prev || p != prev, nullptr); + // Release the proxy pinned before p + if (prev) { + remove_ref(prev); + } + // Do not hold any locks on the list while calling user's code. + // Do not intercept any exceptions that may escape the callback so that + // they are either handled by the TBB scheduler or passed to the debugger. + tso->on_scheduler_entry(worker); + __TBB_ASSERT(p->my_ref_count.load(std::memory_order_relaxed), nullptr); + intptr_t bc = --tso->my_busy_count; + __TBB_ASSERT_EX(bc >= 0, "my_busy_count underflowed"); + prev = p; + } +} + +void observer_list::do_notify_exit_observers(observer_proxy* last, bool worker) { + // Pointer p marches though the list from the beginning to last (inclusively). + observer_proxy* p = nullptr, * prev = nullptr; + for (;;) { + d1::task_scheduler_observer* tso = nullptr; + // Hold lock on list only long enough to advance to the next proxy in the list. + { + scoped_lock lock(mutex(), /*is_writer=*/false); + do { + if (p) { + // We were already processing the list. + if (p != last) { + __TBB_ASSERT(p->my_next, "List items before 'last' must have valid my_next pointer"); + if (p == prev) + remove_ref_fast(prev); // sets prev to nullptr if successful + p = p->my_next; + } else { + // remove the reference from the last item + remove_ref_fast(p); + if (p) { + lock.release(); + if (p != prev && prev) { + remove_ref(prev); + } + remove_ref(p); + } + return; + } + } else { + // Starting pass through the list + p = my_head.load(std::memory_order_relaxed); + __TBB_ASSERT(p, "Nonzero 'last' must guarantee that the global list is non-empty"); + } + tso = p->my_observer; + } while (!tso); + // The item is already refcounted + if (p != last) // the last is already referenced since entry notification + ++p->my_ref_count; + ++tso->my_busy_count; + } + __TBB_ASSERT(!prev || p != prev, nullptr); + if (prev) + remove_ref(prev); + // Do not hold any locks on the list while calling user's code. + // Do not intercept any exceptions that may escape the callback so that + // they are either handled by the TBB scheduler or passed to the debugger. 
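+        // my_busy_count is what observe(false) spin-waits on, so the observer
+        // object cannot be destroyed while this callback is still executing.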
+ tso->on_scheduler_exit(worker); + __TBB_ASSERT(p->my_ref_count || p == last, nullptr); + intptr_t bc = --tso->my_busy_count; + __TBB_ASSERT_EX(bc >= 0, "my_busy_count underflowed"); + prev = p; + } +} + +void __TBB_EXPORTED_FUNC observe(d1::task_scheduler_observer &tso, bool enable) { + if( enable ) { + if( !tso.my_proxy.load(std::memory_order_relaxed) ) { + observer_proxy* p = new observer_proxy(tso); + tso.my_proxy.store(p, std::memory_order_relaxed); + tso.my_busy_count.store(0, std::memory_order_relaxed); + + thread_data* td = governor::get_thread_data_if_initialized(); + if (p->my_observer->my_task_arena == nullptr) { + if (!(td && td->my_arena)) { + td = governor::get_thread_data(); + } + __TBB_ASSERT(__TBB_InitOnce::initialization_done(), nullptr); + __TBB_ASSERT(td && td->my_arena, nullptr); + p->my_list = &td->my_arena->my_observers; + } else { + d1::task_arena* ta = p->my_observer->my_task_arena; + arena* a = ta->my_arena.load(std::memory_order_acquire); + if (a == nullptr) { // Avoid recursion during arena initialization + ta->initialize(); + a = ta->my_arena.load(std::memory_order_relaxed); + } + __TBB_ASSERT(a != nullptr, nullptr); + p->my_list = &a->my_observers; + } + p->my_list->insert(p); + // Notify newly activated observer and other pending ones if it belongs to current arena + if (td && td->my_arena && &td->my_arena->my_observers == p->my_list) { + p->my_list->notify_entry_observers(td->my_last_observer, td->my_is_worker); + } + } + } else { + // Make sure that possible concurrent proxy list cleanup does not conflict + // with the observer destruction here. + if ( observer_proxy* proxy = tso.my_proxy.exchange(nullptr) ) { + // List destruction should not touch this proxy after we've won the above interlocked exchange. + __TBB_ASSERT( proxy->my_observer == &tso, nullptr); + __TBB_ASSERT( is_alive(proxy->my_ref_count.load(std::memory_order_relaxed)), "Observer's proxy died prematurely" ); + __TBB_ASSERT( proxy->my_ref_count.load(std::memory_order_relaxed) >= 1, "reference for observer missing" ); + observer_list &list = *proxy->my_list; + { + // Ensure that none of the list walkers relies on observer pointer validity + observer_list::scoped_lock lock(list.mutex(), /*is_writer=*/true); + proxy->my_observer = nullptr; + // Proxy may still be held by other threads (to track the last notified observer) + if( !--proxy->my_ref_count ) {// nobody can increase it under exclusive lock + list.remove(proxy); + __TBB_ASSERT( !proxy->my_ref_count, nullptr); + delete proxy; + } + } + spin_wait_until_eq(tso.my_busy_count, 0); // other threads are still accessing the callback + } + } +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/observer_proxy.h b/third_party/tbb/observer_proxy.h new file mode 100644 index 000000000..78a692423 --- /dev/null +++ b/third_party/tbb/observer_proxy.h @@ -0,0 +1,153 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_observer_proxy_H +#define __TBB_observer_proxy_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_aligned_space.h" + +#include "third_party/tbb/task_scheduler_observer.h" +#include "third_party/tbb/spin_rw_mutex.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class observer_list { + friend class arena; + + // Mutex is wrapped with aligned_space to shut up warnings when its destructor + // is called while threads are still using it. + typedef aligned_space my_mutex_type; + + //! Pointer to the head of this list. + std::atomic my_head{nullptr}; + + //! Pointer to the tail of this list. + std::atomic my_tail{nullptr}; + + //! Mutex protecting this list. + my_mutex_type my_mutex; + + //! Back-pointer to the arena this list belongs to. + arena* my_arena; + + //! Decrement refcount of the proxy p if there are other outstanding references. + /** In case of success sets p to nullptr. Must be invoked from under the list lock. **/ + inline static void remove_ref_fast( observer_proxy*& p ); + + //! Implements notify_entry_observers functionality. + void do_notify_entry_observers( observer_proxy*& last, bool worker ); + + //! Implements notify_exit_observers functionality. + void do_notify_exit_observers( observer_proxy* last, bool worker ); + +public: + observer_list () = default; + + //! Removes and destroys all observer proxies from the list. + /** Cannot be used concurrently with other methods. **/ + void clear (); + + //! Add observer proxy to the tail of the list. + void insert ( observer_proxy* p ); + + //! Remove observer proxy from the list. + void remove ( observer_proxy* p ); + + //! Decrement refcount of the proxy and destroy it if necessary. + /** When refcount reaches zero removes the proxy from the list and destructs it. **/ + void remove_ref( observer_proxy* p ); + + //! Type of the scoped lock for the reader-writer mutex associated with the list. + typedef spin_rw_mutex::scoped_lock scoped_lock; + + //! Accessor to the reader-writer mutex associated with the list. + spin_rw_mutex& mutex () { return my_mutex.begin()[0]; } + + //! Call entry notifications on observers added after last was notified. + /** Updates last to become the last notified observer proxy (in the global list) + or leaves it to be nullptr. The proxy has its refcount incremented. **/ + inline void notify_entry_observers( observer_proxy*& last, bool worker ); + + //! Call exit notifications on last and observers added before it. + inline void notify_exit_observers( observer_proxy*& last, bool worker ); +}; // class observer_list + +//! Wrapper for an observer object +/** To maintain shared lists of observers the scheduler first wraps each observer + object into a proxy so that a list item remained valid even after the corresponding + proxy object is destroyed by the user code. **/ +class observer_proxy { + friend class d1::task_scheduler_observer; + friend class observer_list; + friend void observe(d1::task_scheduler_observer&, bool); + //! Reference count used for garbage collection. + /** 1 for reference from my task_scheduler_observer. + 1 for each task dispatcher's last observer pointer. + No accounting for neighbors in the shared list. */ + std::atomic my_ref_count; + //! Reference to the list this observer belongs to. + observer_list* my_list; + //! Pointer to next observer in the list specified by my_head. + /** nullptr for the last item in the list. **/ + observer_proxy* my_next; + //! 
Pointer to the previous observer in the list specified by my_head. + /** For the head of the list points to the last item. **/ + observer_proxy* my_prev; + //! Associated observer + d1::task_scheduler_observer* my_observer; + + //! Constructs proxy for the given observer and adds it to the specified list. + observer_proxy( d1::task_scheduler_observer& ); + + ~observer_proxy(); +}; // class observer_proxy + +void observer_list::remove_ref_fast( observer_proxy*& p ) { + if( p->my_observer ) { + // Can decrement refcount quickly, as it cannot drop to zero while under the lock. + std::uintptr_t r = --p->my_ref_count; + __TBB_ASSERT_EX( r, nullptr); + p = nullptr; + } else { + // Use slow form of refcount decrementing, after the lock is released. + } +} + +void observer_list::notify_entry_observers(observer_proxy*& last, bool worker) { + if (last == my_tail.load(std::memory_order_relaxed)) + return; + do_notify_entry_observers(last, worker); +} + +void observer_list::notify_exit_observers( observer_proxy*& last, bool worker ) { + if (last == nullptr) { + return; + } + __TBB_ASSERT(!is_poisoned(last), nullptr); + do_notify_exit_observers( last, worker ); + __TBB_ASSERT(last != nullptr, nullptr); + poison_pointer(last); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* __TBB_observer_proxy_H */ diff --git a/third_party/tbb/parallel_for.h b/third_party/tbb/parallel_for.h new file mode 100644 index 000000000..de7c4166c --- /dev/null +++ b/third_party/tbb/parallel_for.h @@ -0,0 +1,470 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_parallel_for_H +#define __TBB_parallel_for_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/detail/_small_object_pool.h" +#include "third_party/tbb/profiling.h" + +#include "third_party/tbb/partitioner.h" +#include "third_party/tbb/blocked_range.h" +#include "third_party/tbb/task_group.h" + +#include "third_party/libcxx/cstddef" +#include "third_party/libcxx/new" + +namespace tbb { +namespace detail { +#if __TBB_CPP20_CONCEPTS_PRESENT +inline namespace d0 { + +template +concept parallel_for_body = std::copy_constructible && std::invocable&, Range&>; + +template +concept parallel_for_index = std::constructible_from && + std::copyable && + requires( const std::remove_reference_t& lhs, const std::remove_reference_t& rhs ) { + { lhs < rhs } -> adaptive_same_as; + { lhs - rhs } -> std::convertible_to; + { lhs + (rhs - lhs) } -> std::convertible_to; + }; + +template +concept parallel_for_function = std::invocable&, Index>; + +} // namespace d0 +#endif // __TBB_CPP20_CONCEPTS_PRESENT +namespace d1 { + +//! 
Task type used in parallel_for +/** @ingroup algorithms */ +template +struct start_for : public task { + Range my_range; + const Body my_body; + node* my_parent; + + typename Partitioner::task_partition_type my_partition; + small_object_allocator my_allocator; + + task* execute(execution_data&) override; + task* cancel(execution_data&) override; + void finalize(const execution_data&); + + //! Constructor for root task. + start_for( const Range& range, const Body& body, Partitioner& partitioner, small_object_allocator& alloc ) : + my_range(range), + my_body(body), + my_parent(nullptr), + my_partition(partitioner), + my_allocator(alloc) {} + //! Splitting constructor used to generate children. + /** parent_ becomes left child. Newly constructed object is right child. */ + start_for( start_for& parent_, typename Partitioner::split_type& split_obj, small_object_allocator& alloc ) : + my_range(parent_.my_range, get_range_split_object(split_obj)), + my_body(parent_.my_body), + my_parent(nullptr), + my_partition(parent_.my_partition, split_obj), + my_allocator(alloc) {} + //! Construct right child from the given range as response to the demand. + /** parent_ remains left child. Newly constructed object is right child. */ + start_for( start_for& parent_, const Range& r, depth_t d, small_object_allocator& alloc ) : + my_range(r), + my_body(parent_.my_body), + my_parent(nullptr), + my_partition(parent_.my_partition, split()), + my_allocator(alloc) + { + my_partition.align_depth( d ); + } + static void run(const Range& range, const Body& body, Partitioner& partitioner) { + task_group_context context(PARALLEL_FOR); + run(range, body, partitioner, context); + } + + static void run(const Range& range, const Body& body, Partitioner& partitioner, task_group_context& context) { + if ( !range.empty() ) { + small_object_allocator alloc{}; + start_for& for_task = *alloc.new_object(range, body, partitioner, alloc); + + // defer creation of the wait node until task allocation succeeds + wait_node wn; + for_task.my_parent = &wn; + execute_and_wait(for_task, context, wn.m_wait, context); + } + } + //! Run body for range, serves as callback for partitioner + void run_body( Range &r ) { + tbb::detail::invoke(my_body, r); + } + + //! spawn right task, serves as callback for partitioner + void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) { + offer_work_impl(ed, *this, split_obj); + } + + //! spawn right task, serves as callback for partitioner + void offer_work(const Range& r, depth_t d, execution_data& ed) { + offer_work_impl(ed, *this, r, d); + } + +private: + template + void offer_work_impl(execution_data& ed, Args&&... constructor_args) { + // New right child + small_object_allocator alloc{}; + start_for& right_child = *alloc.new_object(ed, std::forward(constructor_args)..., alloc); + + // New root node as a continuation and ref count. Left and right child attach to the new parent. + right_child.my_parent = my_parent = alloc.new_object(ed, my_parent, 2, alloc); + // Spawn the right sibling + right_child.spawn_self(ed); + } + + void spawn_self(execution_data& ed) { + my_partition.spawn_task(*this, *context(ed)); + } +}; + +//! 
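+// Summary of the machinery above: run() allocates the root start_for with a
+// small_object_allocator and blocks on a wait_node; the partitioner calls back
+// into offer_work(), which allocates a right child plus a new parent node with
+// a reference count of 2; finalize() below then folds that tree back up as the
+// children complete and returns the memory to the small object pool.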
fold the tree and deallocate the task +template +void start_for::finalize(const execution_data& ed) { + // Get the current parent and allocator an object destruction + node* parent = my_parent; + auto allocator = my_allocator; + // Task execution finished - destroy it + this->~start_for(); + // Unwind the tree decrementing the parent`s reference count + + fold_tree(parent, ed); + allocator.deallocate(this, ed); + +} + +//! execute task for parallel_for +template +task* start_for::execute(execution_data& ed) { + if (!is_same_affinity(ed)) { + my_partition.note_affinity(execution_slot(ed)); + } + my_partition.check_being_stolen(*this, ed); + my_partition.execute(*this, my_range, ed); + finalize(ed); + return nullptr; +} + +//! cancel task for parallel_for +template +task* start_for::cancel(execution_data& ed) { + finalize(ed); + return nullptr; +} + +//! Calls the function with values from range [begin, end) with a step provided +template +class parallel_for_body_wrapper : detail::no_assign { + const Function &my_func; + const Index my_begin; + const Index my_step; +public: + parallel_for_body_wrapper( const Function& _func, Index& _begin, Index& _step ) + : my_func(_func), my_begin(_begin), my_step(_step) {} + + void operator()( const blocked_range& r ) const { + // A set of local variables to help the compiler with vectorization of the following loop. + Index b = r.begin(); + Index e = r.end(); + Index ms = my_step; + Index k = my_begin + b*ms; + +#if __INTEL_COMPILER +#pragma ivdep +#if __TBB_ASSERT_ON_VECTORIZATION_FAILURE +#pragma vector always assert +#endif +#endif + for ( Index i = b; i < e; ++i, k += ms ) { + tbb::detail::invoke(my_func, k); + } + } +}; + +// Requirements on Range concept are documented in blocked_range.h + +/** \page parallel_for_body_req Requirements on parallel_for body + Class \c Body implementing the concept of parallel_for body must define: + - \code Body::Body( const Body& ); \endcode Copy constructor + - \code Body::~Body(); \endcode Destructor + - \code void Body::operator()( Range& r ) const; \endcode Function call operator applying the body to range \c r. +**/ + +/** \name parallel_for + See also requirements on \ref range_req "Range" and \ref parallel_for_body_req "parallel_for Body". **/ +//@{ + +//! Parallel iteration over range with default partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_for_body) +void parallel_for( const Range& range, const Body& body ) { + start_for::run(range,body,__TBB_DEFAULT_PARTITIONER()); +} + +//! Parallel iteration over range with simple partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_for_body) +void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner ) { + start_for::run(range,body,partitioner); +} + +//! Parallel iteration over range with auto_partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_for_body) +void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner ) { + start_for::run(range,body,partitioner); +} + +//! Parallel iteration over range with static_partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_for_body) +void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner ) { + start_for::run(range,body,partitioner); +} + +//! Parallel iteration over range with affinity_partitioner. 
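+// Illustrative usage, not part of the upstream sources (identifiers are
+// placeholders): the range form is normally called with a blocked_range and a
+// lambda, optionally followed by one of the partitioners handled above.
+//
+//     #include "third_party/tbb/parallel_for.h"
+//     #include "third_party/tbb/blocked_range.h"
+//
+//     void scale(float* a, std::size_t n, float k) {
+//         tbb::parallel_for(tbb::blocked_range<std::size_t>(0, n),
+//             [=](const tbb::blocked_range<std::size_t>& r) {
+//                 for (std::size_t i = r.begin(); i != r.end(); ++i)
+//                     a[i] *= k;
+//             },
+//             tbb::static_partitioner{});
+//     }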
+/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_for_body) +void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner ) { + start_for::run(range,body,partitioner); +} + +//! Parallel iteration over range with default partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_for_body) +void parallel_for( const Range& range, const Body& body, task_group_context& context ) { + start_for::run(range, body, __TBB_DEFAULT_PARTITIONER(), context); +} + +//! Parallel iteration over range with simple partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_for_body) +void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner, task_group_context& context ) { + start_for::run(range, body, partitioner, context); +} + +//! Parallel iteration over range with auto_partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_for_body) +void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner, task_group_context& context ) { + start_for::run(range, body, partitioner, context); +} + +//! Parallel iteration over range with static_partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_for_body) +void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner, task_group_context& context ) { + start_for::run(range, body, partitioner, context); +} + +//! Parallel iteration over range with affinity_partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_for_body) +void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner, task_group_context& context ) { + start_for::run(range,body,partitioner, context); +} + +//! Implementation of parallel iteration over stepped range of integers with explicit step and partitioner +template +void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner) { + if (step <= 0 ) + throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument + else if (first < last) { + // Above "else" avoids "potential divide by zero" warning on some platforms + Index end = Index(last - first - 1ul) / step + Index(1); + blocked_range range(static_cast(0), end); + parallel_for_body_wrapper body(f, first, step); + parallel_for(range, body, partitioner); + } +} + +//! Parallel iteration over a range of integers with a step provided and default partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, Index step, const Function& f) { + parallel_for_impl(first, last, step, f, auto_partitioner()); +} +//! Parallel iteration over a range of integers with a step provided and simple partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner) { + parallel_for_impl(first, last, step, f, partitioner); +} +//! 
Parallel iteration over a range of integers with a step provided and auto partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner) { + parallel_for_impl(first, last, step, f, partitioner); +} +//! Parallel iteration over a range of integers with a step provided and static partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner) { + parallel_for_impl(first, last, step, f, partitioner); +} +//! Parallel iteration over a range of integers with a step provided and affinity partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner) { + parallel_for_impl(first, last, step, f, partitioner); +} + +//! Parallel iteration over a range of integers with a default step value and default partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, const Function& f) { + parallel_for_impl(first, last, static_cast(1), f, auto_partitioner()); +} +//! Parallel iteration over a range of integers with a default step value and simple partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner) { + parallel_for_impl(first, last, static_cast(1), f, partitioner); +} +//! Parallel iteration over a range of integers with a default step value and auto partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner) { + parallel_for_impl(first, last, static_cast(1), f, partitioner); +} +//! Parallel iteration over a range of integers with a default step value and static partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner) { + parallel_for_impl(first, last, static_cast(1), f, partitioner); +} +//! Parallel iteration over a range of integers with a default step value and affinity partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner) { + parallel_for_impl(first, last, static_cast(1), f, partitioner); +} + +//! Implementation of parallel iteration over stepped range of integers with explicit step, task group context, and partitioner +template +void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner, task_group_context &context) { + if (step <= 0 ) + throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument + else if (first < last) { + // Above "else" avoids "potential divide by zero" warning on some platforms + Index end = (last - first - Index(1)) / step + Index(1); + blocked_range range(static_cast(0), end); + parallel_for_body_wrapper body(f, first, step); + parallel_for(range, body, partitioner, context); + } +} + +//! 
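+// Illustrative usage, not part of the upstream sources (callables are
+// placeholders): the index form takes a half-open range [first, last) and an
+// optional step.
+//
+//     tbb::parallel_for(0, 1000, [&](int i) { out[i] = compute(i); });   // step 1
+//     tbb::parallel_for(0, 1000, 4, [&](int i) { compute(i); });         // i = 0, 4, 8, ...
+//
+// parallel_for_impl above maps this onto a blocked_range of iteration numbers:
+// for first = 0, last = 10, step = 3 the computed end is (10 - 0 - 1)/3 + 1 = 4,
+// so the body sees i = 0, 3, 6, 9.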
Parallel iteration over a range of integers with explicit step, task group context, and default partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, Index step, const Function& f, task_group_context &context) { + parallel_for_impl(first, last, step, f, auto_partitioner(), context); +} +//! Parallel iteration over a range of integers with explicit step, task group context, and simple partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner, task_group_context &context) { + parallel_for_impl(first, last, step, f, partitioner, context); +} +//! Parallel iteration over a range of integers with explicit step, task group context, and auto partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner, task_group_context &context) { + parallel_for_impl(first, last, step, f, partitioner, context); +} +//! Parallel iteration over a range of integers with explicit step, task group context, and static partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner, task_group_context &context) { + parallel_for_impl(first, last, step, f, partitioner, context); +} +//! Parallel iteration over a range of integers with explicit step, task group context, and affinity partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner, task_group_context &context) { + parallel_for_impl(first, last, step, f, partitioner, context); +} + +//! Parallel iteration over a range of integers with a default step value, explicit task group context, and default partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, const Function& f, task_group_context &context) { + parallel_for_impl(first, last, static_cast(1), f, auto_partitioner(), context); +} +//! Parallel iteration over a range of integers with a default step value, explicit task group context, and simple partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner, task_group_context &context) { + parallel_for_impl(first, last, static_cast(1), f, partitioner, context); +} +//! Parallel iteration over a range of integers with a default step value, explicit task group context, and auto partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner, task_group_context &context) { + parallel_for_impl(first, last, static_cast(1), f, partitioner, context); +} +//! 
Parallel iteration over a range of integers with a default step value, explicit task group context, and static partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner, task_group_context &context) { + parallel_for_impl(first, last, static_cast(1), f, partitioner, context); +} +//! Parallel iteration over a range of integers with a default step value, explicit task group context, and affinity_partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner, task_group_context &context) { + parallel_for_impl(first, last, static_cast(1), f, partitioner, context); +} +// @} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::parallel_for; +// Split types +using detail::split; +using detail::proportional_split; +} // namespace v1 + +} // namespace tbb + +#endif /* __TBB_parallel_for_H */ diff --git a/third_party/tbb/parallel_for_each.h b/third_party/tbb/parallel_for_each.h new file mode 100644 index 000000000..a4752c69a --- /dev/null +++ b/third_party/tbb/parallel_for_each.h @@ -0,0 +1,682 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_parallel_for_each_H +#define __TBB_parallel_for_each_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/detail/_aligned_space.h" +#include "third_party/tbb/detail/_small_object_pool.h" +#include "third_party/tbb/detail/_utils.h" + +#include "third_party/tbb/parallel_for.h" +#include "third_party/tbb/task_group.h" // task_group_context + +#include "third_party/libcxx/iterator" +#include "third_party/libcxx/type_traits" + +namespace tbb { +namespace detail { +#if __TBB_CPP20_CONCEPTS_PRESENT +namespace d1 { +template +class feeder; + +} // namespace d1 +inline namespace d0 { + +template +concept parallel_for_each_body = std::invocable&, ItemType&&> || + std::invocable&, ItemType&&, tbb::detail::d1::feeder&>; + +} // namespace d0 +#endif // __TBB_CPP20_CONCEPTS_PRESENT +namespace d2 { +template class feeder_impl; +} // namespace d2 + +namespace d1 { +//! Class the user supplied algorithm body uses to add new tasks +template +class feeder { + feeder() {} + feeder(const feeder&) = delete; + void operator=( const feeder&) = delete; + + virtual ~feeder () {} + virtual void internal_add_copy(const Item& item) = 0; + virtual void internal_add_move(Item&& item) = 0; + + template friend class d2::feeder_impl; +public: + //! Add a work item to a running parallel_for_each. 
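+    //! Safe to call from inside the body while the algorithm is running: each
+    //! added item is spawned as an additional task and is processed before the
+    //! enclosing parallel_for_each returns.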
+ void add(const Item& item) {internal_add_copy(item);} + void add(Item&& item) {internal_add_move(std::move(item));} +}; + +} // namespace d1 + +namespace d2 { +using namespace tbb::detail::d1; +/** Selects one of the two possible forms of function call member operator. + @ingroup algorithms **/ +template +struct parallel_for_each_operator_selector { +public: + template + static auto call(const Body& body, ItemArg&& item, FeederArg*) + -> decltype(tbb::detail::invoke(body, std::forward(item)), void()) { + #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) + // Suppression of Microsoft non-standard extension warnings + #pragma warning (push) + #pragma warning (disable: 4239) + #endif + + tbb::detail::invoke(body, std::forward(item)); + + #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) + #pragma warning (pop) + #endif + } + + template + static auto call(const Body& body, ItemArg&& item, FeederArg* feeder) + -> decltype(tbb::detail::invoke(body, std::forward(item), *feeder), void()) { + #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) + // Suppression of Microsoft non-standard extension warnings + #pragma warning (push) + #pragma warning (disable: 4239) + #endif + __TBB_ASSERT(feeder, "Feeder was not created but should be"); + + tbb::detail::invoke(body, std::forward(item), *feeder); + + #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) + #pragma warning (pop) + #endif + } +}; + +template +struct feeder_item_task: public task { + using feeder_type = feeder_impl; + + template + feeder_item_task(ItemType&& input_item, feeder_type& feeder, small_object_allocator& alloc) : + item(std::forward(input_item)), + my_feeder(feeder), + my_allocator(alloc) + {} + + void finalize(const execution_data& ed) { + my_feeder.my_wait_context.release(); + my_allocator.delete_object(this, ed); + } + + //! Hack for resolve ambiguity between calls to the body with and without moving the stored copy + //! Executing body with moving the copy should have higher priority + using first_priority = int; + using second_priority = double; + + template + static auto call(const BodyType& call_body, ItemType& call_item, FeederType& call_feeder, first_priority) + -> decltype(parallel_for_each_operator_selector::call(call_body, std::move(call_item), &call_feeder), void()) + { + parallel_for_each_operator_selector::call(call_body, std::move(call_item), &call_feeder); + } + + template + static void call(const BodyType& call_body, ItemType& call_item, FeederType& call_feeder, second_priority) { + parallel_for_each_operator_selector::call(call_body, call_item, &call_feeder); + } + + task* execute(execution_data& ed) override { + call(my_feeder.my_body, item, my_feeder, first_priority{}); + finalize(ed); + return nullptr; + } + + task* cancel(execution_data& ed) override { + finalize(ed); + return nullptr; + } + + Item item; + feeder_type& my_feeder; + small_object_allocator my_allocator; +}; // class feeder_item_task + +/** Implements new task adding procedure. 
+ @ingroup algorithms **/ +template +class feeder_impl : public feeder { + // Avoiding use of copy constructor in a virtual method if the type does not support it + void internal_add_copy_impl(std::true_type, const Item& item) { + using feeder_task = feeder_item_task; + small_object_allocator alloc; + auto task = alloc.new_object(item, *this, alloc); + + my_wait_context.reserve(); + spawn(*task, my_execution_context); + } + + void internal_add_copy_impl(std::false_type, const Item&) { + __TBB_ASSERT(false, "Overloading for r-value reference doesn't work or it's not movable and not copyable object"); + } + + void internal_add_copy(const Item& item) override { + internal_add_copy_impl(typename std::is_copy_constructible::type(), item); + } + + void internal_add_move(Item&& item) override { + using feeder_task = feeder_item_task; + small_object_allocator alloc{}; + auto task = alloc.new_object(std::move(item), *this, alloc); + + my_wait_context.reserve(); + spawn(*task, my_execution_context); + } +public: + feeder_impl(const Body& body, wait_context& w_context, task_group_context &context) + : my_body(body), + my_wait_context(w_context) + , my_execution_context(context) + {} + + const Body& my_body; + wait_context& my_wait_context; + task_group_context& my_execution_context; +}; // class feeder_impl + +/** Execute computation under one element of the range + @ingroup algorithms **/ +template +struct for_each_iteration_task: public task { + using feeder_type = feeder_impl; + + for_each_iteration_task(Iterator input_item_ptr, const Body& body, feeder_impl* feeder_ptr, wait_context& wait_context) : + item_ptr(input_item_ptr), my_body(body), my_feeder_ptr(feeder_ptr), parent_wait_context(wait_context) + {} + + void finalize() { + parent_wait_context.release(); + } + + task* execute(execution_data&) override { + parallel_for_each_operator_selector::call(my_body, *item_ptr, my_feeder_ptr); + finalize(); + return nullptr; + } + + task* cancel(execution_data&) override { + finalize(); + return nullptr; + } + + Iterator item_ptr; + const Body& my_body; + feeder_impl* my_feeder_ptr; + wait_context& parent_wait_context; +}; // class for_each_iteration_task + +// Helper to get the type of the iterator to the internal sequence of copies +// If the element can be passed to the body as an rvalue - this iterator should be move_iterator +template +struct input_iteration_task_iterator_helper { + // For input iterators we pass const lvalue reference to the body + // It is prohibited to take non-constant lvalue references for input iterators + using type = const Item*; +}; + +template +struct input_iteration_task_iterator_helper::call(std::declval(), + std::declval(), + std::declval*>()))>> +{ + using type = std::move_iterator; +}; + +/** Split one block task to several(max_block_size) iteration tasks for input iterators + @ingroup algorithms **/ +template +struct input_block_handling_task : public task { + static constexpr size_t max_block_size = 4; + + using feeder_type = feeder_impl; + using iteration_task_iterator_type = typename input_iteration_task_iterator_helper::type; + using iteration_task = for_each_iteration_task; + + input_block_handling_task(wait_context& root_wait_context, task_group_context& e_context, + const Body& body, feeder_impl* feeder_ptr, small_object_allocator& alloc) + :my_size(0), my_wait_context(0), my_root_wait_context(root_wait_context), + my_execution_context(e_context), my_allocator(alloc) + { + auto item_it = block_iteration_space.begin(); + for (auto* it = task_pool.begin(); it 
!= task_pool.end(); ++it) { + new (it) iteration_task(iteration_task_iterator_type(item_it++), body, feeder_ptr, my_wait_context); + } + } + + void finalize(const execution_data& ed) { + my_root_wait_context.release(); + my_allocator.delete_object(this, ed); + } + + task* execute(execution_data& ed) override { + __TBB_ASSERT( my_size > 0, "Negative size was passed to task"); + for (std::size_t counter = 1; counter < my_size; ++counter) { + my_wait_context.reserve(); + spawn(*(task_pool.begin() + counter), my_execution_context); + } + my_wait_context.reserve(); + execute_and_wait(*task_pool.begin(), my_execution_context, + my_wait_context, my_execution_context); + + // deallocate current task after children execution + finalize(ed); + return nullptr; + } + + task* cancel(execution_data& ed) override { + finalize(ed); + return nullptr; + } + + ~input_block_handling_task() { + for(std::size_t counter = 0; counter < max_block_size; ++counter) { + (task_pool.begin() + counter)->~iteration_task(); + if (counter < my_size) { + (block_iteration_space.begin() + counter)->~Item(); + } + } + } + + aligned_space block_iteration_space; + aligned_space task_pool; + std::size_t my_size; + wait_context my_wait_context; + wait_context& my_root_wait_context; + task_group_context& my_execution_context; + small_object_allocator my_allocator; +}; // class input_block_handling_task + +/** Split one block task to several(max_block_size) iteration tasks for forward iterators + @ingroup algorithms **/ +template +struct forward_block_handling_task : public task { + static constexpr size_t max_block_size = 4; + + using iteration_task = for_each_iteration_task; + + forward_block_handling_task(Iterator first, std::size_t size, + wait_context& w_context, task_group_context& e_context, + const Body& body, feeder_impl* feeder_ptr, + small_object_allocator& alloc) + : my_size(size), my_wait_context(0), my_root_wait_context(w_context), + my_execution_context(e_context), my_allocator(alloc) + { + auto* task_it = task_pool.begin(); + for (std::size_t i = 0; i < size; i++) { + new (task_it++) iteration_task(first, body, feeder_ptr, my_wait_context); + ++first; + } + } + + void finalize(const execution_data& ed) { + my_root_wait_context.release(); + my_allocator.delete_object(this, ed); + } + + task* execute(execution_data& ed) override { + __TBB_ASSERT( my_size > 0, "Negative size was passed to task"); + for(std::size_t counter = 1; counter < my_size; ++counter) { + my_wait_context.reserve(); + spawn(*(task_pool.begin() + counter), my_execution_context); + } + my_wait_context.reserve(); + execute_and_wait(*task_pool.begin(), my_execution_context, + my_wait_context, my_execution_context); + + // deallocate current task after children execution + finalize(ed); + return nullptr; + } + + task* cancel(execution_data& ed) override { + finalize(ed); + return nullptr; + } + + ~forward_block_handling_task() { + for(std::size_t counter = 0; counter < my_size; ++counter) { + (task_pool.begin() + counter)->~iteration_task(); + } + } + + aligned_space task_pool; + std::size_t my_size; + wait_context my_wait_context; + wait_context& my_root_wait_context; + task_group_context& my_execution_context; + small_object_allocator my_allocator; +}; // class forward_block_handling_task + +/** Body for parallel_for algorithm. + * Allows to redirect operations under random access iterators range to the parallel_for algorithm. 
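+ * (This wrapper implements the random access strategy; input and forward
+ * iterators are handled instead by the block tasks above, which peel off chunks
+ * of at most max_block_size == 4 items at a time.)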
+ @ingroup algorithms **/ +template +class parallel_for_body_wrapper { + Iterator my_first; + const Body& my_body; + feeder_impl* my_feeder_ptr; +public: + parallel_for_body_wrapper(Iterator first, const Body& body, feeder_impl* feeder_ptr) + : my_first(first), my_body(body), my_feeder_ptr(feeder_ptr) {} + + void operator()(tbb::blocked_range range) const { +#if __INTEL_COMPILER +#pragma ivdep +#endif + for (std::size_t count = range.begin(); count != range.end(); count++) { + parallel_for_each_operator_selector::call(my_body, *(my_first + count), + my_feeder_ptr); + } + } +}; // class parallel_for_body_wrapper + + +/** Helper for getting iterators tag including inherited custom tags + @ingroup algorithms */ +template +using tag = typename std::iterator_traits::iterator_category; + +template +using iterator_tag_dispatch = typename + std::conditional< + std::is_base_of>::value, + std::random_access_iterator_tag, + typename std::conditional< + std::is_base_of>::value, + std::forward_iterator_tag, + std::input_iterator_tag + >::type + >::type; + +template +using feeder_is_required = tbb::detail::void_t(), + std::declval::reference>(), + std::declval&>()))>; + +// Creates feeder object only if the body can accept it +template +struct feeder_holder { + feeder_holder( wait_context&, task_group_context&, const Body& ) {} + + feeder_impl* feeder_ptr() { return nullptr; } +}; // class feeder_holder + +template +class feeder_holder> { +public: + feeder_holder( wait_context& w_context, task_group_context& context, const Body& body ) + : my_feeder(body, w_context, context) {} + + feeder_impl* feeder_ptr() { return &my_feeder; } +private: + feeder_impl my_feeder; +}; // class feeder_holder + +template +class for_each_root_task_base : public task { +public: + for_each_root_task_base(Iterator first, Iterator last, const Body& body, wait_context& w_context, task_group_context& e_context) + : my_first(first), my_last(last), my_wait_context(w_context), my_execution_context(e_context), + my_body(body), my_feeder_holder(my_wait_context, my_execution_context, my_body) + { + my_wait_context.reserve(); + } +private: + task* cancel(execution_data&) override { + this->my_wait_context.release(); + return nullptr; + } +protected: + Iterator my_first; + Iterator my_last; + wait_context& my_wait_context; + task_group_context& my_execution_context; + const Body& my_body; + feeder_holder my_feeder_holder; +}; // class for_each_root_task_base + +/** parallel_for_each algorithm root task - most generic version + * Splits input range to blocks + @ingroup algorithms **/ +template > +class for_each_root_task : public for_each_root_task_base +{ + using base_type = for_each_root_task_base; +public: + using base_type::base_type; +private: + task* execute(execution_data& ed) override { + using block_handling_type = input_block_handling_task; + + if (this->my_first == this->my_last) { + this->my_wait_context.release(); + return nullptr; + } + + this->my_wait_context.reserve(); + small_object_allocator alloc{}; + auto block_handling_task = alloc.new_object(ed, this->my_wait_context, this->my_execution_context, + this->my_body, this->my_feeder_holder.feeder_ptr(), + alloc); + + auto* block_iterator = block_handling_task->block_iteration_space.begin(); + for (; !(this->my_first == this->my_last) && block_handling_task->my_size < block_handling_type::max_block_size; ++this->my_first) { + // Move semantics are automatically used when supported by the iterator + new (block_iterator++) Item(*this->my_first); + 
++block_handling_task->my_size; + } + + // Do not access this after spawn to avoid races + spawn(*this, this->my_execution_context); + return block_handling_task; + } +}; // class for_each_root_task - most generic implementation + +/** parallel_for_each algorithm root task - forward iterator based specialization + * Splits input range to blocks + @ingroup algorithms **/ +template +class for_each_root_task + : public for_each_root_task_base +{ + using base_type = for_each_root_task_base; +public: + using base_type::base_type; +private: + task* execute(execution_data& ed) override { + using block_handling_type = forward_block_handling_task; + if (this->my_first == this->my_last) { + this->my_wait_context.release(); + return nullptr; + } + + std::size_t block_size{0}; + Iterator first_block_element = this->my_first; + for (; !(this->my_first == this->my_last) && block_size < block_handling_type::max_block_size; ++this->my_first) { + ++block_size; + } + + this->my_wait_context.reserve(); + small_object_allocator alloc{}; + auto block_handling_task = alloc.new_object(ed, first_block_element, block_size, + this->my_wait_context, this->my_execution_context, + this->my_body, this->my_feeder_holder.feeder_ptr(), alloc); + + // Do not access this after spawn to avoid races + spawn(*this, this->my_execution_context); + return block_handling_task; + } +}; // class for_each_root_task - forward iterator based specialization + +/** parallel_for_each algorithm root task - random access iterator based specialization + * Splits input range to blocks + @ingroup algorithms **/ +template +class for_each_root_task + : public for_each_root_task_base +{ + using base_type = for_each_root_task_base; +public: + using base_type::base_type; +private: + task* execute(execution_data&) override { + tbb::parallel_for( + tbb::blocked_range(0, std::distance(this->my_first, this->my_last)), + parallel_for_body_wrapper(this->my_first, this->my_body, this->my_feeder_holder.feeder_ptr()) + , this->my_execution_context + ); + + this->my_wait_context.release(); + return nullptr; + } +}; // class for_each_root_task - random access iterator based specialization + +/** Helper for getting item type. If item type can be deduced from feeder - got it from feeder, + if feeder is generic - got item type from range. + @ingroup algorithms */ +template +auto feeder_argument_parser(void (Body::*)(Item, feeder&) const) -> FeederArg; + +template +decltype(feeder_argument_parser(&Body::operator())) get_item_type_impl(int); // for (T, feeder) +template Item get_item_type_impl(...); // stub + +template +using get_item_type = decltype(get_item_type_impl(0)); + +#if __TBB_CPP20_CONCEPTS_PRESENT +template +using feeder_item_type = std::remove_cvref_t>; + +template +concept parallel_for_each_iterator_body = + parallel_for_each_body, feeder_item_type>>; + +template +concept parallel_for_each_range_body = + parallel_for_each_body, feeder_item_type>>; +#endif + +/** Implements parallel iteration over a range. 
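+
+    A hedged usage sketch (not part of the upstream sources) of the public overloads that
+    funnel into this helper; the container v and the bound 100 are hypothetical:
+    \code
+    std::vector<int> v = {1, 2, 3};
+
+    // Plain element-wise form.
+    tbb::parallel_for_each(v.begin(), v.end(), [](int& x) { x *= 2; });
+
+    // Feeder form: the body may add extra work items while iterating.
+    tbb::parallel_for_each(v, [](int x, tbb::feeder<int>& feeder) {
+        if (x < 100) feeder.add(x + 1);
+    });
+    \endcode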
+ @ingroup algorithms */ +template +void run_parallel_for_each( Iterator first, Iterator last, const Body& body, task_group_context& context) +{ + if (!(first == last)) { + using ItemType = get_item_type::value_type>; + wait_context w_context(0); + + for_each_root_task root_task(first, last, body, w_context, context); + + execute_and_wait(root_task, context, w_context, context); + } +} + +/** \page parallel_for_each_body_req Requirements on parallel_for_each body + Class \c Body implementing the concept of parallel_for_each body must define: + - \code + B::operator()( + cv_item_type item, + feeder& feeder + ) const + + OR + + B::operator()( cv_item_type& item ) const + \endcode Process item. + May be invoked concurrently for the same \c this but different \c item. + + - \code item_type( const item_type& ) \endcode + Copy a work item. + - \code ~item_type() \endcode Destroy a work item +**/ + +/** \name parallel_for_each + See also requirements on \ref parallel_for_each_body_req "parallel_for_each Body". **/ +//@{ +//! Parallel iteration over a range, with optional addition of more work. +/** @ingroup algorithms */ +template + __TBB_requires(std::input_iterator && parallel_for_each_iterator_body) +void parallel_for_each(Iterator first, Iterator last, const Body& body) { + task_group_context context(PARALLEL_FOR_EACH); + run_parallel_for_each(first, last, body, context); +} + +template + __TBB_requires(container_based_sequence && parallel_for_each_range_body) +void parallel_for_each(Range& rng, const Body& body) { + parallel_for_each(std::begin(rng), std::end(rng), body); +} + +template + __TBB_requires(container_based_sequence && parallel_for_each_range_body) +void parallel_for_each(const Range& rng, const Body& body) { + parallel_for_each(std::begin(rng), std::end(rng), body); +} + +//! Parallel iteration over a range, with optional addition of more work and user-supplied context +/** @ingroup algorithms */ +template + __TBB_requires(std::input_iterator && parallel_for_each_iterator_body) +void parallel_for_each(Iterator first, Iterator last, const Body& body, task_group_context& context) { + run_parallel_for_each(first, last, body, context); +} + +template + __TBB_requires(container_based_sequence && parallel_for_each_range_body) +void parallel_for_each(Range& rng, const Body& body, task_group_context& context) { + parallel_for_each(std::begin(rng), std::end(rng), body, context); +} + +template + __TBB_requires(container_based_sequence && parallel_for_each_range_body) +void parallel_for_each(const Range& rng, const Body& body, task_group_context& context) { + parallel_for_each(std::begin(rng), std::end(rng), body, context); +} + +} // namespace d2 +} // namespace detail +//! @endcond +//@} + +inline namespace v1 { +using detail::d2::parallel_for_each; +using detail::d1::feeder; +} // namespace v1 + +} // namespace tbb + +#endif /* __TBB_parallel_for_each_H */ diff --git a/third_party/tbb/parallel_invoke.h b/third_party/tbb/parallel_invoke.h new file mode 100644 index 000000000..899c57cba --- /dev/null +++ b/third_party/tbb/parallel_invoke.h @@ -0,0 +1,228 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#ifndef __TBB_parallel_invoke_H
+#define __TBB_parallel_invoke_H
+
+#include "third_party/tbb/detail/_config.h"
+#include "third_party/tbb/detail/_namespace_injection.h"
+#include "third_party/tbb/detail/_exception.h"
+#include "third_party/tbb/detail/_task.h"
+#include "third_party/tbb/detail/_template_helpers.h"
+#include "third_party/tbb/detail/_small_object_pool.h"
+
+#include "third_party/tbb/task_group.h"
+
+#include "third_party/libcxx/tuple"
+#include "third_party/libcxx/atomic"
+#include "third_party/libcxx/utility"
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+//! Simple task object, executing user method
+template <typename Function, typename WaitObject>
+struct function_invoker : public task {
+    function_invoker(const Function& function, WaitObject& wait_ctx) :
+        my_function(function),
+        parent_wait_ctx(wait_ctx)
+    {}
+
+    task* execute(execution_data& ed) override {
+        my_function();
+        parent_wait_ctx.release(ed);
+        call_itt_task_notify(destroy, this);
+        return nullptr;
+    }
+
+    task* cancel(execution_data& ed) override {
+        parent_wait_ctx.release(ed);
+        return nullptr;
+    }
+
+    const Function& my_function;
+    WaitObject& parent_wait_ctx;
+}; // struct function_invoker
+
+//! Task object for managing subroots in trinary task trees.
+// Endowed with additional synchronization logic (compatible with wait object interfaces) to support
+// continuation passing execution. This task spawns two function_invoker tasks for the second and third
+// functors and then executes the first functor itself. Only the functor that finishes last destroys
+// and deallocates the subroot task.
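+//
+// An illustrative sketch (not part of the upstream sources) of the user-facing call that
+// exercises this machinery. With more than three callables, groups of three are packed into
+// invoke_subroot_task objects and the remaining (at most three) callables run through plain
+// function_invoker tasks:
+//
+//     int a = 0, b = 0, c = 0;
+//     tbb::parallel_invoke(
+//         [&] { a = 1; },
+//         [&] { b = 2; },
+//         [&] { c = 3; });
+//     // a, b and c have all been assigned by the time parallel_invoke returns.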
+template +struct invoke_subroot_task : public task { + wait_context& root_wait_ctx; + std::atomic ref_count{0}; + bool child_spawned = false; + + const F1& self_invoked_functor; + function_invoker> f2_invoker; + function_invoker> f3_invoker; + + task_group_context& my_execution_context; + small_object_allocator my_allocator; + + invoke_subroot_task(const F1& f1, const F2& f2, const F3& f3, wait_context& wait_ctx, task_group_context& context, + small_object_allocator& alloc) : + root_wait_ctx(wait_ctx), + self_invoked_functor(f1), + f2_invoker(f2, *this), + f3_invoker(f3, *this), + my_execution_context(context), + my_allocator(alloc) + { + root_wait_ctx.reserve(); + } + + void finalize(const execution_data& ed) { + root_wait_ctx.release(); + + my_allocator.delete_object(this, ed); + } + + void release(const execution_data& ed) { + __TBB_ASSERT(ref_count > 0, nullptr); + call_itt_task_notify(releasing, this); + if( --ref_count == 0 ) { + call_itt_task_notify(acquired, this); + finalize(ed); + } + } + + task* execute(execution_data& ed) override { + ref_count.fetch_add(3, std::memory_order_relaxed); + spawn(f3_invoker, my_execution_context); + spawn(f2_invoker, my_execution_context); + self_invoked_functor(); + + release(ed); + return nullptr; + } + + task* cancel(execution_data& ed) override { + if( ref_count > 0 ) { // detect children spawn + release(ed); + } else { + finalize(ed); + } + return nullptr; + } +}; // struct subroot_task + +class invoke_root_task { +public: + invoke_root_task(wait_context& wc) : my_wait_context(wc) {} + void release(const execution_data&) { + my_wait_context.release(); + } +private: + wait_context& my_wait_context; +}; + +template +void invoke_recursive_separation(wait_context& root_wait_ctx, task_group_context& context, const F1& f1) { + root_wait_ctx.reserve(1); + invoke_root_task root(root_wait_ctx); + function_invoker invoker1(f1, root); + + execute_and_wait(invoker1, context, root_wait_ctx, context); +} + +template +void invoke_recursive_separation(wait_context& root_wait_ctx, task_group_context& context, const F1& f1, const F2& f2) { + root_wait_ctx.reserve(2); + invoke_root_task root(root_wait_ctx); + function_invoker invoker1(f1, root); + function_invoker invoker2(f2, root); + + spawn(invoker1, context); + execute_and_wait(invoker2, context, root_wait_ctx, context); +} + +template +void invoke_recursive_separation(wait_context& root_wait_ctx, task_group_context& context, const F1& f1, const F2& f2, const F3& f3) { + root_wait_ctx.reserve(3); + invoke_root_task root(root_wait_ctx); + function_invoker invoker1(f1, root); + function_invoker invoker2(f2, root); + function_invoker invoker3(f3, root); + + //TODO: implement sub root for two tasks (measure performance) + spawn(invoker1, context); + spawn(invoker2, context); + execute_and_wait(invoker3, context, root_wait_ctx, context); +} + +template +void invoke_recursive_separation(wait_context& root_wait_ctx, task_group_context& context, + const F1& f1, const F2& f2, const F3& f3, const Fs&... fs) { + small_object_allocator alloc{}; + auto sub_root = alloc.new_object>(f1, f2, f3, root_wait_ctx, context, alloc); + spawn(*sub_root, context); + + invoke_recursive_separation(root_wait_ctx, context, fs...); +} + +template +void parallel_invoke_impl(task_group_context& context, const Fs&... 
fs) { + static_assert(sizeof...(Fs) >= 2, "Parallel invoke may be called with at least two callable"); + wait_context root_wait_ctx{0}; + + invoke_recursive_separation(root_wait_ctx, context, fs...); +} + +template +void parallel_invoke_impl(const F1& f1, const Fs&... fs) { + static_assert(sizeof...(Fs) >= 1, "Parallel invoke may be called with at least two callable"); + task_group_context context(PARALLEL_INVOKE); + wait_context root_wait_ctx{0}; + + invoke_recursive_separation(root_wait_ctx, context, fs..., f1); +} + +//! Passes last argument of variadic pack as first for handling user provided task_group_context +template +struct invoke_helper; + +template +struct invoke_helper, T, Fs...> : invoke_helper, Fs...> {}; + +template +struct invoke_helper, T> { + void operator()(Fs&&... args, T&& t) { + parallel_invoke_impl(std::forward(t), std::forward(args)...); + } +}; + +//! Parallel execution of several function objects +// We need to pass parameter pack through forwarding reference, +// since this pack may contain task_group_context that must be passed via lvalue non-const reference +template +void parallel_invoke(Fs&&... fs) { + invoke_helper, Fs...>()(std::forward(fs)...); +} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::parallel_invoke; +} // namespace v1 + +} // namespace tbb +#endif /* __TBB_parallel_invoke_H */ diff --git a/third_party/tbb/parallel_pipeline.cpp b/third_party/tbb/parallel_pipeline.cpp new file mode 100644 index 000000000..dd0b4b651 --- /dev/null +++ b/third_party/tbb/parallel_pipeline.cpp @@ -0,0 +1,472 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/parallel_pipeline.h" +#include "third_party/tbb/spin_mutex.h" +#include "third_party/tbb/tbb_allocator.h" +#include "third_party/tbb/cache_aligned_allocator.h" +#include "third_party/tbb/itt_notify.h" +#include "third_party/tbb/tls.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_small_object_pool.h" + +namespace tbb { +namespace detail { +namespace r1 { + +void handle_perror(int error_code, const char* aux_info); + +using Token = unsigned long; + +//! A processing pipeline that applies filters to items. +/** @ingroup algorithms */ +class pipeline { + friend void parallel_pipeline(d1::task_group_context&, std::size_t, const d1::filter_node&); +public: + + //! Construct empty pipeline. + pipeline(d1::task_group_context& cxt, std::size_t max_token) : + my_context(cxt), + first_filter(nullptr), + last_filter(nullptr), + input_tokens(Token(max_token)), + end_of_input(false), + wait_ctx(0) { + __TBB_ASSERT( max_token>0, "pipeline::run must have at least one token" ); + } + + ~pipeline(); + + //! Add filter to end of pipeline. + void add_filter( d1::base_filter& ); + + //! 
Traverse tree of fitler-node in-order and add filter for each leaf + void fill_pipeline(const d1::filter_node& root) { + if( root.left && root.right ) { + fill_pipeline(*root.left); + fill_pipeline(*root.right); + } + else { + __TBB_ASSERT(!root.left && !root.right, "tree should be full"); + add_filter(*root.create_filter()); + } + } + +private: + friend class stage_task; + friend class base_filter; + friend void set_end_of_input(d1::base_filter& bf); + + task_group_context& my_context; + + //! Pointer to first filter in the pipeline. + d1::base_filter* first_filter; + + //! Pointer to last filter in the pipeline. + d1::base_filter* last_filter; + + //! Number of idle tokens waiting for input stage. + std::atomic input_tokens; + + //! False until flow_control::stop() is called. + std::atomic end_of_input; + + d1::wait_context wait_ctx; +}; + +//! This structure is used to store task information in an input buffer +struct task_info { + void* my_object = nullptr; + //! Invalid unless a task went through an ordered stage. + Token my_token = 0; + //! False until my_token is set. + bool my_token_ready = false; + //! True if my_object is valid. + bool is_valid = false; + //! Set to initial state (no object, no token) + void reset() { + my_object = nullptr; + my_token = 0; + my_token_ready = false; + is_valid = false; + } +}; + +//! A buffer of input items for a filter. +/** Each item is a task_info, inserted into a position in the buffer corresponding to a Token. */ +class input_buffer { + friend class base_filter; + friend class stage_task; + friend class pipeline; + friend void set_end_of_input(d1::base_filter& bf); + + using size_type = Token; + + //! Array of deferred tasks that cannot yet start executing. + task_info* array; + + //! Size of array + /** Always 0 or a power of 2 */ + size_type array_size; + + //! Lowest token that can start executing. + /** All prior Token have already been seen. */ + Token low_token; + + //! Serializes updates. + spin_mutex array_mutex; + + //! Resize "array". + /** Caller is responsible to acquiring a lock on "array_mutex". */ + void grow( size_type minimum_size ); + + //! Initial size for "array" + /** Must be a power of 2 */ + static const size_type initial_buffer_size = 4; + + //! Used for out of order buffer, and for assigning my_token if is_ordered and my_token not already assigned + Token high_token; + + //! True for ordered filter, false otherwise. + const bool is_ordered; + + //! for parallel filters that accepts nullptrs, thread-local flag for reaching end_of_input + using end_of_input_tls_t = basic_tls; + end_of_input_tls_t end_of_input_tls; + bool end_of_input_tls_allocated; // no way to test pthread creation of TLS + +public: + input_buffer(const input_buffer&) = delete; + input_buffer& operator=(const input_buffer&) = delete; + + //! Construct empty buffer. + input_buffer( bool ordered) : + array(nullptr), + array_size(0), + low_token(0), + high_token(0), + is_ordered(ordered), + end_of_input_tls(), + end_of_input_tls_allocated(false) { + grow(initial_buffer_size); + __TBB_ASSERT( array, nullptr ); + } + + //! Destroy the buffer. + ~input_buffer() { + __TBB_ASSERT( array, nullptr ); + cache_aligned_allocator().deallocate(array,array_size); + poison_pointer( array ); + if( end_of_input_tls_allocated ) { + destroy_my_tls(); + } + } + + //! Define order when the first filter is serial_in_order. + Token get_ordered_token(){ + return high_token++; + } + + //! Put a token into the buffer. 
+ /** If task information was placed into buffer, returns true; + otherwise returns false, informing the caller to create and spawn a task. + */ + bool try_put_token( task_info& info ) { + info.is_valid = true; + spin_mutex::scoped_lock lock( array_mutex ); + Token token; + if( is_ordered ) { + if( !info.my_token_ready ) { + info.my_token = high_token++; + info.my_token_ready = true; + } + token = info.my_token; + } else + token = high_token++; + __TBB_ASSERT( (long)(token-low_token)>=0, nullptr ); + if( token!=low_token ) { + // Trying to put token that is beyond low_token. + // Need to wait until low_token catches up before dispatching. + if( token-low_token>=array_size ) + grow( token-low_token+1 ); + ITT_NOTIFY( sync_releasing, this ); + array[token&(array_size-1)] = info; + return true; + } + return false; + } + + //! Note that processing of a token is finished. + /** Fires up processing of the next token, if processing was deferred. */ + // Uses template to avoid explicit dependency on stage_task. + template + void try_to_spawn_task_for_next_token(StageTask& spawner, d1::execution_data& ed) { + task_info wakee; + { + spin_mutex::scoped_lock lock( array_mutex ); + // Wake the next task + task_info& item = array[++low_token & (array_size-1)]; + ITT_NOTIFY( sync_acquired, this ); + wakee = item; + item.is_valid = false; + } + if( wakee.is_valid ) + spawner.spawn_stage_task(wakee, ed); + } + + // end_of_input signal for parallel_pipeline, parallel input filters with 0 tokens allowed. + void create_my_tls() { + int status = end_of_input_tls.create(); + if(status) + handle_perror(status, "TLS not allocated for filter"); + end_of_input_tls_allocated = true; + } + void destroy_my_tls() { + int status = end_of_input_tls.destroy(); + if(status) + handle_perror(status, "Failed to destroy filter TLS"); + } + bool my_tls_end_of_input() { + return end_of_input_tls.get() != nullptr; + } + void set_my_tls_end_of_input() { + end_of_input_tls.set(this); + } +}; + +void input_buffer::grow( size_type minimum_size ) { + size_type old_size = array_size; + size_type new_size = old_size ? 2*old_size : initial_buffer_size; + while( new_size().allocate(new_size); + task_info* old_array = array; + for( size_type i=0; i().deallocate(old_array,old_size); +} + +class stage_task : public d1::task, public task_info { +private: + friend class pipeline; + pipeline& my_pipeline; + d1::base_filter* my_filter; + d1::small_object_allocator m_allocator; + //! True if this task has not yet read the input. + bool my_at_start; + + //! True if this can be executed again. + bool execute_filter(d1::execution_data& ed); + + //! Spawn task if token is available. + void try_spawn_stage_task(d1::execution_data& ed) { + ITT_NOTIFY( sync_releasing, &my_pipeline.input_tokens ); + if( (my_pipeline.input_tokens.fetch_sub(1, std::memory_order_release)) > 1 ) { + d1::small_object_allocator alloc{}; + r1::spawn( *alloc.new_object(ed, my_pipeline, alloc ), my_pipeline.my_context ); + } + } + +public: + + //! Construct stage_task for first stage in a pipeline. + /** Such a stage has not read any input yet. */ + stage_task(pipeline& pipeline, d1::small_object_allocator& alloc ) : + my_pipeline(pipeline), + my_filter(pipeline.first_filter), + m_allocator(alloc), + my_at_start(true) + { + task_info::reset(); + my_pipeline.wait_ctx.reserve(); + } + //! Construct stage_task for a subsequent stage in a pipeline. 
+ stage_task(pipeline& pipeline, d1::base_filter* filter, const task_info& info, d1::small_object_allocator& alloc) : + task_info(info), + my_pipeline(pipeline), + my_filter(filter), + m_allocator(alloc), + my_at_start(false) + { + my_pipeline.wait_ctx.reserve(); + } + //! Roughly equivalent to the constructor of input stage task + void reset() { + task_info::reset(); + my_filter = my_pipeline.first_filter; + my_at_start = true; + } + void finalize(d1::execution_data& ed) { + m_allocator.delete_object(this, ed); + } + //! The virtual task execution method + task* execute(d1::execution_data& ed) override { + if(!execute_filter(ed)) { + finalize(ed); + return nullptr; + } + return this; + } + task* cancel(d1::execution_data& ed) override { + finalize(ed); + return nullptr; + } + + ~stage_task() override { + if ( my_filter && my_object ) { + my_filter->finalize(my_object); + my_object = nullptr; + } + my_pipeline.wait_ctx.release(); + } + //! Creates and spawns stage_task from task_info + void spawn_stage_task(const task_info& info, d1::execution_data& ed) { + d1::small_object_allocator alloc{}; + stage_task* clone = alloc.new_object(ed, my_pipeline, my_filter, info, alloc); + r1::spawn(*clone, my_pipeline.my_context); + } +}; + +bool stage_task::execute_filter(d1::execution_data& ed) { + __TBB_ASSERT( !my_at_start || !my_object, "invalid state of task" ); + if( my_at_start ) { + if( my_filter->is_serial() ) { + my_object = (*my_filter)(my_object); + if( my_object || ( my_filter->object_may_be_null() && !my_pipeline.end_of_input.load(std::memory_order_relaxed)) ) { + if( my_filter->is_ordered() ) { + my_token = my_filter->my_input_buffer->get_ordered_token(); + my_token_ready = true; + } + if( !my_filter->next_filter_in_pipeline ) { // we're only filter in pipeline + reset(); + return true; + } else { + try_spawn_stage_task(ed); + } + } else { + my_pipeline.end_of_input.store(true, std::memory_order_relaxed); + return false; + } + } else /*not is_serial*/ { + if ( my_pipeline.end_of_input.load(std::memory_order_relaxed) ) { + return false; + } + + try_spawn_stage_task(ed); + + my_object = (*my_filter)(my_object); + if( !my_object && (!my_filter->object_may_be_null() || my_filter->my_input_buffer->my_tls_end_of_input()) ){ + my_pipeline.end_of_input.store(true, std::memory_order_relaxed); + return false; + } + } + my_at_start = false; + } else { + my_object = (*my_filter)(my_object); + if( my_filter->is_serial() ) + my_filter->my_input_buffer->try_to_spawn_task_for_next_token(*this, ed); + } + my_filter = my_filter->next_filter_in_pipeline; + if( my_filter ) { + // There is another filter to execute. + if( my_filter->is_serial() ) { + // The next filter must execute tokens when they are available (in order for serial_in_order) + if( my_filter->my_input_buffer->try_put_token(*this) ){ + my_filter = nullptr; // To prevent deleting my_object twice if exception occurs + return false; + } + } + } else { + // Reached end of the pipe. + std::size_t ntokens_avail = my_pipeline.input_tokens.fetch_add(1, std::memory_order_acquire); + + if( ntokens_avail>0 // Only recycle if there is one available token + || my_pipeline.end_of_input.load(std::memory_order_relaxed) ) { + return false; // No need to recycle for new input + } + ITT_NOTIFY( sync_acquired, &my_pipeline.input_tokens ); + // Recycle as an input stage task. 
+ reset(); + } + return true; +} + +pipeline::~pipeline() { + while( first_filter ) { + d1::base_filter* f = first_filter; + if( input_buffer* b = f->my_input_buffer ) { + b->~input_buffer(); + deallocate_memory(b); + } + first_filter = f->next_filter_in_pipeline; + f->~base_filter(); + deallocate_memory(f); + } +} + +void pipeline::add_filter( d1::base_filter& new_fitler ) { + __TBB_ASSERT( new_fitler.next_filter_in_pipeline==d1::base_filter::not_in_pipeline(), "filter already part of pipeline?" ); + new_fitler.my_pipeline = this; + if ( first_filter == nullptr ) + first_filter = &new_fitler; + else + last_filter->next_filter_in_pipeline = &new_fitler; + new_fitler.next_filter_in_pipeline = nullptr; + last_filter = &new_fitler; + if( new_fitler.is_serial() ) { + new_fitler.my_input_buffer = new (allocate_memory(sizeof(input_buffer))) input_buffer( new_fitler.is_ordered() ); + } else { + if( first_filter == &new_fitler && new_fitler.object_may_be_null() ) { + //TODO: buffer only needed to hold TLS; could improve + new_fitler.my_input_buffer = new (allocate_memory(sizeof(input_buffer))) input_buffer( /*is_ordered*/false ); + new_fitler.my_input_buffer->create_my_tls(); + } + } +} + +void __TBB_EXPORTED_FUNC parallel_pipeline(d1::task_group_context& cxt, std::size_t max_token, const d1::filter_node& fn) { + pipeline pipe(cxt, max_token); + + pipe.fill_pipeline(fn); + + d1::small_object_allocator alloc{}; + stage_task& st = *alloc.new_object(pipe, alloc); + + // Start execution of tasks + r1::execute_and_wait(st, cxt, pipe.wait_ctx, cxt); +} + +void __TBB_EXPORTED_FUNC set_end_of_input(d1::base_filter& bf) { + __TBB_ASSERT(bf.my_input_buffer, nullptr); + __TBB_ASSERT(bf.object_may_be_null(), nullptr); + if(bf.is_serial() ) { + bf.my_pipeline->end_of_input.store(true, std::memory_order_relaxed); + } else { + __TBB_ASSERT(bf.my_input_buffer->end_of_input_tls_allocated, nullptr); + bf.my_input_buffer->set_my_tls_end_of_input(); + } +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/parallel_pipeline.h b/third_party/tbb/parallel_pipeline.h new file mode 100644 index 000000000..3cc24afe4 --- /dev/null +++ b/third_party/tbb/parallel_pipeline.h @@ -0,0 +1,154 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_parallel_pipeline_H +#define __TBB_parallel_pipeline_H + +#include "third_party/tbb/detail/_pipeline_filters.h" +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/task_group.h" + +#include "third_party/libcxx/cstddef" +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/type_traits" + +namespace tbb { +namespace detail { + +namespace r1 { +TBB_EXPORT void __TBB_EXPORTED_FUNC parallel_pipeline(task_group_context&, std::size_t, const d1::filter_node&); +} + +namespace d1 { + +enum class filter_mode : unsigned int +{ + //! 
processes multiple items in parallel and in no particular order + parallel = base_filter::filter_is_out_of_order, + //! processes items one at a time; all such filters process items in the same order + serial_in_order = base_filter::filter_is_serial, + //! processes items one at a time and in no particular order + serial_out_of_order = base_filter::filter_is_serial | base_filter::filter_is_out_of_order +}; +//! Class representing a chain of type-safe pipeline filters +/** @ingroup algorithms */ +template +class filter { + filter_node_ptr my_root; + filter( filter_node_ptr root ) : my_root(root) {} + friend void parallel_pipeline( size_t, const filter&, task_group_context& ); + template + friend filter make_filter( filter_mode, const Body& ); + template + friend filter operator&( const filter&, const filter& ); +public: + filter() = default; + filter( const filter& rhs ) : my_root(rhs.my_root) {} + filter( filter&& rhs ) : my_root(std::move(rhs.my_root)) {} + + void operator=(const filter& rhs) { + my_root = rhs.my_root; + } + void operator=( filter&& rhs ) { + my_root = std::move(rhs.my_root); + } + + template + filter( filter_mode mode, const Body& body ) : + my_root( new(r1::allocate_memory(sizeof(filter_node_leaf))) + filter_node_leaf(static_cast(mode), body) ) { + } + + filter& operator&=( const filter& right ) { + *this = *this & right; + return *this; + } + + void clear() { + // Like operator= with filter() on right side. + my_root = nullptr; + } +}; + +//! Create a filter to participate in parallel_pipeline +/** @ingroup algorithms */ +template +filter make_filter( filter_mode mode, const Body& body ) { + return filter_node_ptr( new(r1::allocate_memory(sizeof(filter_node_leaf))) + filter_node_leaf(static_cast(mode), body) ); +} + +//! Create a filter to participate in parallel_pipeline +/** @ingroup algorithms */ +template +filter, filter_output> make_filter( filter_mode mode, const Body& body ) { + return make_filter, filter_output>(mode, body); +} + +//! Composition of filters left and right. +/** @ingroup algorithms */ +template +filter operator&( const filter& left, const filter& right ) { + __TBB_ASSERT(left.my_root,"cannot use default-constructed filter as left argument of '&'"); + __TBB_ASSERT(right.my_root,"cannot use default-constructed filter as right argument of '&'"); + return filter_node_ptr( new (r1::allocate_memory(sizeof(filter_node))) filter_node(left.my_root,right.my_root) ); +} + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT +template +filter(filter_mode, Body) +->filter, filter_output>; +#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +//! Parallel pipeline over chain of filters with user-supplied context. +/** @ingroup algorithms **/ +inline void parallel_pipeline(size_t max_number_of_live_tokens, const filter& filter_chain, task_group_context& context) { + r1::parallel_pipeline(context, max_number_of_live_tokens, *filter_chain.my_root); +} + +//! Parallel pipeline over chain of filters. +/** @ingroup algorithms **/ +inline void parallel_pipeline(size_t max_number_of_live_tokens, const filter& filter_chain) { + task_group_context context; + parallel_pipeline(max_number_of_live_tokens, filter_chain, context); +} + +//! Parallel pipeline over sequence of filters. +/** @ingroup algorithms **/ +template +void parallel_pipeline(size_t max_number_of_live_tokens, + const F1& filter1, + const F2& filter2, + FiltersContext&&... 
filters) { + parallel_pipeline(max_number_of_live_tokens, filter1 & filter2, std::forward(filters)...); +} + +} // namespace d1 +} // namespace detail + +inline namespace v1 +{ +using detail::d1::parallel_pipeline; +using detail::d1::filter; +using detail::d1::make_filter; +using detail::d1::filter_mode; +using detail::d1::flow_control; +} +} // tbb + +#endif /* __TBB_parallel_pipeline_H */ diff --git a/third_party/tbb/parallel_reduce.h b/third_party/tbb/parallel_reduce.h new file mode 100644 index 000000000..1fc549ce1 --- /dev/null +++ b/third_party/tbb/parallel_reduce.h @@ -0,0 +1,772 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_parallel_reduce_H +#define __TBB_parallel_reduce_H + +#include "third_party/libcxx/new" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/detail/_aligned_space.h" +#include "third_party/tbb/detail/_small_object_pool.h" +#include "third_party/tbb/detail/_range_common.h" + +#include "third_party/tbb/task_group.h" // task_group_context +#include "third_party/tbb/partitioner.h" +#include "third_party/tbb/profiling.h" + +namespace tbb { +namespace detail { +#if __TBB_CPP20_CONCEPTS_PRESENT +inline namespace d0 { + +template +concept parallel_reduce_body = splittable && + requires( Body& body, const Range& range, Body& rhs ) { + body(range); + body.join(rhs); + }; + +template +concept parallel_reduce_function = std::invocable&, + const Range&, const Value&> && + std::convertible_to&, + const Range&, const Value&>, + Value>; + +template +concept parallel_reduce_combine = std::invocable&, + const Value&, const Value&> && + std::convertible_to&, + const Value&, const Value&>, + Value>; + +} // namespace d0 +#endif // __TBB_CPP20_CONCEPTS_PRESENT +namespace d1 { + +//! Tree node type for parallel_reduce. +/** @ingroup algorithms */ +//TODO: consider folding tree via bypass execution(instead of manual folding) +// for better cancellation and critical tasks handling (performance measurements required). +template +struct reduction_tree_node : public tree_node { + tbb::detail::aligned_space zombie_space; + Body& left_body; + bool has_right_zombie{false}; + + reduction_tree_node(node* parent, int ref_count, Body& input_left_body, small_object_allocator& alloc) : + tree_node{parent, ref_count, alloc}, + left_body(input_left_body) /* gcc4.8 bug - braced-initialization doesn't work for class members of reference type */ + {} + + void join(task_group_context* context) { + if (has_right_zombie && !context->is_group_execution_cancelled()) + left_body.join(*zombie_space.begin()); + } + + ~reduction_tree_node() { + if( has_right_zombie ) zombie_space.begin()->~Body(); + } +}; + +//! Task type used to split the work of parallel_reduce. 
+/** @ingroup algorithms */ +template +struct start_reduce : public task { + Range my_range; + Body* my_body; + node* my_parent; + + typename Partitioner::task_partition_type my_partition; + small_object_allocator my_allocator; + bool is_right_child; + + task* execute(execution_data&) override; + task* cancel(execution_data&) override; + void finalize(const execution_data&); + + using tree_node_type = reduction_tree_node; + + //! Constructor reduce root task. + start_reduce( const Range& range, Body& body, Partitioner& partitioner, small_object_allocator& alloc ) : + my_range(range), + my_body(&body), + my_parent(nullptr), + my_partition(partitioner), + my_allocator(alloc), + is_right_child(false) {} + //! Splitting constructor used to generate children. + /** parent_ becomes left child. Newly constructed object is right child. */ + start_reduce( start_reduce& parent_, typename Partitioner::split_type& split_obj, small_object_allocator& alloc ) : + my_range(parent_.my_range, get_range_split_object(split_obj)), + my_body(parent_.my_body), + my_parent(nullptr), + my_partition(parent_.my_partition, split_obj), + my_allocator(alloc), + is_right_child(true) + { + parent_.is_right_child = false; + } + //! Construct right child from the given range as response to the demand. + /** parent_ remains left child. Newly constructed object is right child. */ + start_reduce( start_reduce& parent_, const Range& r, depth_t d, small_object_allocator& alloc ) : + my_range(r), + my_body(parent_.my_body), + my_parent(nullptr), + my_partition(parent_.my_partition, split()), + my_allocator(alloc), + is_right_child(true) + { + my_partition.align_depth( d ); + parent_.is_right_child = false; + } + static void run(const Range& range, Body& body, Partitioner& partitioner, task_group_context& context) { + if ( !range.empty() ) { + wait_node wn; + small_object_allocator alloc{}; + auto reduce_task = alloc.new_object(range, body, partitioner, alloc); + reduce_task->my_parent = &wn; + execute_and_wait(*reduce_task, context, wn.m_wait, context); + } + } + static void run(const Range& range, Body& body, Partitioner& partitioner) { + // Bound context prevents exceptions from body to affect nesting or sibling algorithms, + // and allows users to handle exceptions safely by wrapping parallel_reduce in the try-block. + task_group_context context(PARALLEL_REDUCE); + run(range, body, partitioner, context); + } + //! Run body for range, serves as callback for partitioner + void run_body( Range &r ) { + tbb::detail::invoke(*my_body, r); + } + + //! spawn right task, serves as callback for partitioner + void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) { + offer_work_impl(ed, *this, split_obj); + } + //! spawn right task, serves as callback for partitioner + void offer_work(const Range& r, depth_t d, execution_data& ed) { + offer_work_impl(ed, *this, r, d); + } + +private: + template + void offer_work_impl(execution_data& ed, Args&&... args) { + small_object_allocator alloc{}; + // New right child + auto right_child = alloc.new_object(ed, std::forward(args)..., alloc); + + // New root node as a continuation and ref count. Left and right child attach to the new parent. + right_child->my_parent = my_parent = alloc.new_object(ed, my_parent, 2, *my_body, alloc); + + // Spawn the right sibling + right_child->spawn_self(ed); + } + + void spawn_self(execution_data& ed) { + my_partition.spawn_task(*this, *context(ed)); + } +}; + +//! 
fold the tree and deallocate the task +template +void start_reduce::finalize(const execution_data& ed) { + // Get the current parent and wait object before an object destruction + node* parent = my_parent; + auto allocator = my_allocator; + // Task execution finished - destroy it + this->~start_reduce(); + // Unwind the tree decrementing the parent`s reference count + fold_tree(parent, ed); + allocator.deallocate(this, ed); +} + +//! Execute parallel_reduce task +template +task* start_reduce::execute(execution_data& ed) { + if (!is_same_affinity(ed)) { + my_partition.note_affinity(execution_slot(ed)); + } + my_partition.check_being_stolen(*this, ed); + + // The acquire barrier synchronizes the data pointed with my_body if the left + // task has already finished. + __TBB_ASSERT(my_parent, nullptr); + if( is_right_child && my_parent->m_ref_count.load(std::memory_order_acquire) == 2 ) { + tree_node_type* parent_ptr = static_cast(my_parent); + my_body = static_cast(new( parent_ptr->zombie_space.begin() ) Body(*my_body, split())); + parent_ptr->has_right_zombie = true; + } + __TBB_ASSERT(my_body != nullptr, "Incorrect body value"); + + my_partition.execute(*this, my_range, ed); + + finalize(ed); + return nullptr; +} + +//! Cancel parallel_reduce task +template +task* start_reduce::cancel(execution_data& ed) { + finalize(ed); + return nullptr; +} + +//! Tree node type for parallel_deterministic_reduce. +/** @ingroup algorithms */ +template +struct deterministic_reduction_tree_node : public tree_node { + Body right_body; + Body& left_body; + + deterministic_reduction_tree_node(node* parent, int ref_count, Body& input_left_body, small_object_allocator& alloc) : + tree_node{parent, ref_count, alloc}, + right_body{input_left_body, detail::split()}, + left_body(input_left_body) + {} + + void join(task_group_context* context) { + if (!context->is_group_execution_cancelled()) + left_body.join(right_body); + } +}; + +//! Task type used to split the work of parallel_deterministic_reduce. +/** @ingroup algorithms */ +template +struct start_deterministic_reduce : public task { + Range my_range; + Body& my_body; + node* my_parent; + + typename Partitioner::task_partition_type my_partition; + small_object_allocator my_allocator; + + task* execute(execution_data&) override; + task* cancel(execution_data&) override; + void finalize(const execution_data&); + + using tree_node_type = deterministic_reduction_tree_node; + + //! Constructor deterministic_reduce root task. + start_deterministic_reduce( const Range& range, Partitioner& partitioner, Body& body, small_object_allocator& alloc ) : + my_range(range), + my_body(body), + my_parent(nullptr), + my_partition(partitioner), + my_allocator(alloc) {} + //! Splitting constructor used to generate children. + /** parent_ becomes left child. Newly constructed object is right child. 
*/ + start_deterministic_reduce( start_deterministic_reduce& parent_, typename Partitioner::split_type& split_obj, Body& body, + small_object_allocator& alloc ) : + my_range(parent_.my_range, get_range_split_object(split_obj)), + my_body(body), + my_parent(nullptr), + my_partition(parent_.my_partition, split_obj), + my_allocator(alloc) {} + static void run(const Range& range, Body& body, Partitioner& partitioner, task_group_context& context) { + if ( !range.empty() ) { + wait_node wn; + small_object_allocator alloc{}; + auto deterministic_reduce_task = + alloc.new_object(range, partitioner, body, alloc); + deterministic_reduce_task->my_parent = &wn; + execute_and_wait(*deterministic_reduce_task, context, wn.m_wait, context); + } + } + static void run(const Range& range, Body& body, Partitioner& partitioner) { + // Bound context prevents exceptions from body to affect nesting or sibling algorithms, + // and allows users to handle exceptions safely by wrapping parallel_deterministic_reduce + // in the try-block. + task_group_context context(PARALLEL_REDUCE); + run(range, body, partitioner, context); + } + //! Run body for range, serves as callback for partitioner + void run_body( Range &r ) { + tbb::detail::invoke(my_body, r); + } + //! Spawn right task, serves as callback for partitioner + void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) { + offer_work_impl(ed, *this, split_obj); + } +private: + template + void offer_work_impl(execution_data& ed, Args&&... args) { + small_object_allocator alloc{}; + // New root node as a continuation and ref count. Left and right child attach to the new parent. Split the body. + auto new_tree_node = alloc.new_object(ed, my_parent, 2, my_body, alloc); + + // New right child + auto right_child = alloc.new_object(ed, std::forward(args)..., new_tree_node->right_body, alloc); + + right_child->my_parent = my_parent = new_tree_node; + + // Spawn the right sibling + right_child->spawn_self(ed); + } + + void spawn_self(execution_data& ed) { + my_partition.spawn_task(*this, *context(ed)); + } +}; + +//! Fold the tree and deallocate the task +template +void start_deterministic_reduce::finalize(const execution_data& ed) { + // Get the current parent and wait object before an object destruction + node* parent = my_parent; + + auto allocator = my_allocator; + // Task execution finished - destroy it + this->~start_deterministic_reduce(); + // Unwind the tree decrementing the parent`s reference count + fold_tree(parent, ed); + allocator.deallocate(this, ed); +} + +//! Execute parallel_deterministic_reduce task +template +task* start_deterministic_reduce::execute(execution_data& ed) { + if (!is_same_affinity(ed)) { + my_partition.note_affinity(execution_slot(ed)); + } + my_partition.check_being_stolen(*this, ed); + + my_partition.execute(*this, my_range, ed); + + finalize(ed); + return nullptr; +} + +//! Cancel parallel_deterministic_reduce task +template +task* start_deterministic_reduce::cancel(execution_data& ed) { + finalize(ed); + return nullptr; +} + + +//! Auxiliary class for parallel_reduce; for internal use only. +/** The adaptor class that implements \ref parallel_reduce_body_req "parallel_reduce Body" + using given \ref parallel_reduce_lambda_req "anonymous function objects". 
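+
+    An illustrative sketch (not part of the upstream sources) of the functional form that
+    this adaptor enables; v is a hypothetical std::vector<int>:
+    \code
+    int total = tbb::parallel_reduce(
+        tbb::blocked_range<std::size_t>(0, v.size()),
+        0,                                               // identity element
+        [&](const tbb::blocked_range<std::size_t>& r, int running) {
+            for (std::size_t i = r.begin(); i != r.end(); ++i)
+                running += v[i];
+            return running;                              // partial result for this sub-range
+        },
+        [](int x, int y) { return x + y; });             // join two partial results
+    \endcode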
+ **/ +/** @ingroup algorithms */ +template +class lambda_reduce_body { +//TODO: decide if my_real_body, my_reduction, and my_identity_element should be copied or referenced +// (might require some performance measurements) + + const Value& my_identity_element; + const RealBody& my_real_body; + const Reduction& my_reduction; + Value my_value; + lambda_reduce_body& operator= ( const lambda_reduce_body& other ); +public: + lambda_reduce_body( const Value& identity, const RealBody& body, const Reduction& reduction ) + : my_identity_element(identity) + , my_real_body(body) + , my_reduction(reduction) + , my_value(identity) + { } + lambda_reduce_body( const lambda_reduce_body& other ) = default; + lambda_reduce_body( lambda_reduce_body& other, tbb::split ) + : my_identity_element(other.my_identity_element) + , my_real_body(other.my_real_body) + , my_reduction(other.my_reduction) + , my_value(other.my_identity_element) + { } + void operator()(Range& range) { + my_value = tbb::detail::invoke(my_real_body, range, const_cast(my_value)); + } + void join( lambda_reduce_body& rhs ) { + my_value = tbb::detail::invoke(my_reduction, const_cast(my_value), + const_cast(rhs.my_value)); + } + Value result() const { + return my_value; + } +}; + + +// Requirements on Range concept are documented in blocked_range.h + +/** \page parallel_reduce_body_req Requirements on parallel_reduce body + Class \c Body implementing the concept of parallel_reduce body must define: + - \code Body::Body( Body&, split ); \endcode Splitting constructor. + Must be able to run concurrently with operator() and method \c join + - \code Body::~Body(); \endcode Destructor + - \code void Body::operator()( Range& r ); \endcode Function call operator applying body to range \c r + and accumulating the result + - \code void Body::join( Body& b ); \endcode Join results. + The result in \c b should be merged into the result of \c this +**/ + +/** \page parallel_reduce_lambda_req Requirements on parallel_reduce anonymous function objects (lambda functions) + TO BE DOCUMENTED +**/ + +/** \name parallel_reduce + See also requirements on \ref range_req "Range" and \ref parallel_reduce_body_req "parallel_reduce Body". **/ +//@{ + +//! Parallel iteration with reduction and default partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_reduce( const Range& range, Body& body ) { + start_reduce::run( range, body, __TBB_DEFAULT_PARTITIONER() ); +} + +//! Parallel iteration with reduction and simple_partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_reduce( const Range& range, Body& body, const simple_partitioner& partitioner ) { + start_reduce::run( range, body, partitioner ); +} + +//! Parallel iteration with reduction and auto_partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_reduce( const Range& range, Body& body, const auto_partitioner& partitioner ) { + start_reduce::run( range, body, partitioner ); +} + +//! Parallel iteration with reduction and static_partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_reduce( const Range& range, Body& body, const static_partitioner& partitioner ) { + start_reduce::run( range, body, partitioner ); +} + +//! 
Parallel iteration with reduction and affinity_partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_reduce( const Range& range, Body& body, affinity_partitioner& partitioner ) { + start_reduce::run( range, body, partitioner ); +} + +//! Parallel iteration with reduction, default partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_reduce( const Range& range, Body& body, task_group_context& context ) { + start_reduce::run( range, body, __TBB_DEFAULT_PARTITIONER(), context ); +} + +//! Parallel iteration with reduction, simple partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_reduce( const Range& range, Body& body, const simple_partitioner& partitioner, task_group_context& context ) { + start_reduce::run( range, body, partitioner, context ); +} + +//! Parallel iteration with reduction, auto_partitioner and user-supplied context +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_reduce( const Range& range, Body& body, const auto_partitioner& partitioner, task_group_context& context ) { + start_reduce::run( range, body, partitioner, context ); +} + +//! Parallel iteration with reduction, static_partitioner and user-supplied context +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_reduce( const Range& range, Body& body, const static_partitioner& partitioner, task_group_context& context ) { + start_reduce::run( range, body, partitioner, context ); +} + +//! Parallel iteration with reduction, affinity_partitioner and user-supplied context +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_reduce( const Range& range, Body& body, affinity_partitioner& partitioner, task_group_context& context ) { + start_reduce::run( range, body, partitioner, context ); +} +/** parallel_reduce overloads that work with anonymous function objects + (see also \ref parallel_reduce_lambda_req "requirements on parallel_reduce anonymous function objects"). **/ + +//! Parallel iteration with reduction and default partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction ) { + lambda_reduce_body body(identity, real_body, reduction); + start_reduce,const __TBB_DEFAULT_PARTITIONER> + ::run(range, body, __TBB_DEFAULT_PARTITIONER() ); + return body.result(); +} + +//! Parallel iteration with reduction and simple_partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + const simple_partitioner& partitioner ) { + lambda_reduce_body body(identity, real_body, reduction); + start_reduce,const simple_partitioner> + ::run(range, body, partitioner ); + return body.result(); +} + +//! 
Parallel iteration with reduction and auto_partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + const auto_partitioner& partitioner ) { + lambda_reduce_body body(identity, real_body, reduction); + start_reduce,const auto_partitioner> + ::run( range, body, partitioner ); + return body.result(); +} + +//! Parallel iteration with reduction and static_partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + const static_partitioner& partitioner ) { + lambda_reduce_body body(identity, real_body, reduction); + start_reduce,const static_partitioner> + ::run( range, body, partitioner ); + return body.result(); +} + +//! Parallel iteration with reduction and affinity_partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + affinity_partitioner& partitioner ) { + lambda_reduce_body body(identity, real_body, reduction); + start_reduce,affinity_partitioner> + ::run( range, body, partitioner ); + return body.result(); +} + +//! Parallel iteration with reduction, default partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + task_group_context& context ) { + lambda_reduce_body body(identity, real_body, reduction); + start_reduce,const __TBB_DEFAULT_PARTITIONER> + ::run( range, body, __TBB_DEFAULT_PARTITIONER(), context ); + return body.result(); +} + +//! Parallel iteration with reduction, simple partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + const simple_partitioner& partitioner, task_group_context& context ) { + lambda_reduce_body body(identity, real_body, reduction); + start_reduce,const simple_partitioner> + ::run( range, body, partitioner, context ); + return body.result(); +} + +//! Parallel iteration with reduction, auto_partitioner and user-supplied context +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + const auto_partitioner& partitioner, task_group_context& context ) { + lambda_reduce_body body(identity, real_body, reduction); + start_reduce,const auto_partitioner> + ::run( range, body, partitioner, context ); + return body.result(); +} + +//! 
Parallel iteration with reduction, static_partitioner and user-supplied context +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + const static_partitioner& partitioner, task_group_context& context ) { + lambda_reduce_body body(identity, real_body, reduction); + start_reduce,const static_partitioner> + ::run( range, body, partitioner, context ); + return body.result(); +} + +//! Parallel iteration with reduction, affinity_partitioner and user-supplied context +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + affinity_partitioner& partitioner, task_group_context& context ) { + lambda_reduce_body body(identity, real_body, reduction); + start_reduce,affinity_partitioner> + ::run( range, body, partitioner, context ); + return body.result(); +} + +//! Parallel iteration with deterministic reduction and default simple partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_deterministic_reduce( const Range& range, Body& body ) { + start_deterministic_reduce::run(range, body, simple_partitioner()); +} + +//! Parallel iteration with deterministic reduction and simple partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_deterministic_reduce( const Range& range, Body& body, const simple_partitioner& partitioner ) { + start_deterministic_reduce::run(range, body, partitioner); +} + +//! Parallel iteration with deterministic reduction and static partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_deterministic_reduce( const Range& range, Body& body, const static_partitioner& partitioner ) { + start_deterministic_reduce::run(range, body, partitioner); +} + +//! Parallel iteration with deterministic reduction, default simple partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_deterministic_reduce( const Range& range, Body& body, task_group_context& context ) { + start_deterministic_reduce::run( range, body, simple_partitioner(), context ); +} + +//! Parallel iteration with deterministic reduction, simple partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_deterministic_reduce( const Range& range, Body& body, const simple_partitioner& partitioner, task_group_context& context ) { + start_deterministic_reduce::run(range, body, partitioner, context); +} + +//! Parallel iteration with deterministic reduction, static partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_deterministic_reduce( const Range& range, Body& body, const static_partitioner& partitioner, task_group_context& context ) { + start_deterministic_reduce::run(range, body, partitioner, context); +} + +/** parallel_reduce overloads that work with anonymous function objects + (see also \ref parallel_reduce_lambda_req "requirements on parallel_reduce anonymous function objects"). **/ + +//! 
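The overloads that take a Body reference, including the deterministic variants that follow, expect an imperative body: it is split for concurrent subranges, accumulates into itself, and partial results are merged with join(). A minimal sketch with illustrative names:

#include "third_party/tbb/parallel_reduce.h"
#include "third_party/tbb/blocked_range.h"
#include <climits>
#include <cstddef>

struct MinBody {
    const int* data;
    int value = INT_MAX;
    explicit MinBody(const int* d) : data(d) {}
    MinBody(MinBody& other, tbb::split) : data(other.data) {}  // splitting constructor
    void operator()(const tbb::blocked_range<std::size_t>& r) {
        for (std::size_t i = r.begin(); i != r.end(); ++i)
            if (data[i] < value) value = data[i];              // accumulate into *this
    }
    void join(MinBody& rhs) { if (rhs.value < value) value = rhs.value; }
};

int parallel_min(const int* d, std::size_t n) {
    MinBody body(d);
    tbb::parallel_reduce(tbb::blocked_range<std::size_t>(0, n), body);
    return body.value;   // the same body type also satisfies parallel_deterministic_reduce
}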
Parallel iteration with deterministic reduction and default simple partitioner. +// TODO: consider making static_partitioner the default +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction ) { + return parallel_deterministic_reduce(range, identity, real_body, reduction, simple_partitioner()); +} + +//! Parallel iteration with deterministic reduction and simple partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, const simple_partitioner& partitioner ) { + lambda_reduce_body body(identity, real_body, reduction); + start_deterministic_reduce, const simple_partitioner> + ::run(range, body, partitioner); + return body.result(); +} + +//! Parallel iteration with deterministic reduction and static partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, const static_partitioner& partitioner ) { + lambda_reduce_body body(identity, real_body, reduction); + start_deterministic_reduce, const static_partitioner> + ::run(range, body, partitioner); + return body.result(); +} + +//! Parallel iteration with deterministic reduction, default simple partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + task_group_context& context ) { + return parallel_deterministic_reduce(range, identity, real_body, reduction, simple_partitioner(), context); +} + +//! Parallel iteration with deterministic reduction, simple partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + const simple_partitioner& partitioner, task_group_context& context ) { + lambda_reduce_body body(identity, real_body, reduction); + start_deterministic_reduce, const simple_partitioner> + ::run(range, body, partitioner, context); + return body.result(); +} + +//! Parallel iteration with deterministic reduction, static partitioner and user-supplied context. 
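The deterministic variants above accept only simple_partitioner or static_partitioner, which keeps the split/join tree independent of work stealing; with a deterministic body this makes, for example, floating-point sums reproducible from run to run. A sketch under that assumption (reproducible_sum is an illustrative name):

#include "third_party/tbb/parallel_reduce.h"
#include "third_party/tbb/blocked_range.h"
#include <cstddef>
#include <vector>

double reproducible_sum(const std::vector<double>& v) {
    return tbb::parallel_deterministic_reduce(
        tbb::blocked_range<std::size_t>(0, v.size(), /*grainsize=*/4096),
        0.0,
        [&](const tbb::blocked_range<std::size_t>& r, double acc) {
            for (std::size_t i = r.begin(); i != r.end(); ++i) acc += v[i];
            return acc;
        },
        [](double a, double b) { return a + b; });   // defaults to simple_partitioner
}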
+/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + const static_partitioner& partitioner, task_group_context& context ) { + lambda_reduce_body body(identity, real_body, reduction); + start_deterministic_reduce, const static_partitioner> + ::run(range, body, partitioner, context); + return body.result(); +} +//@} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::parallel_reduce; +using detail::d1::parallel_deterministic_reduce; +// Split types +using detail::split; +using detail::proportional_split; +} // namespace v1 + +} // namespace tbb +#endif /* __TBB_parallel_reduce_H */ diff --git a/third_party/tbb/parallel_scan.h b/third_party/tbb/parallel_scan.h new file mode 100644 index 000000000..dba033af8 --- /dev/null +++ b/third_party/tbb/parallel_scan.h @@ -0,0 +1,631 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_parallel_scan_H +#define __TBB_parallel_scan_H + +#include "third_party/libcxx/functional" + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_task.h" + +#include "third_party/tbb/profiling.h" +#include "third_party/tbb/partitioner.h" +#include "third_party/tbb/blocked_range.h" +#include "third_party/tbb/task_group.h" + +namespace tbb { +namespace detail { +namespace d1 { + +//! Used to indicate that the initial scan is being performed. +/** @ingroup algorithms */ +struct pre_scan_tag { + static bool is_final_scan() {return false;} + operator bool() {return is_final_scan();} +}; + +//! Used to indicate that the final scan is being performed. +/** @ingroup algorithms */ +struct final_scan_tag { + static bool is_final_scan() {return true;} + operator bool() {return is_final_scan();} +}; + +template +struct sum_node; + +#if __TBB_CPP20_CONCEPTS_PRESENT +} // namespace d1 +namespace d0 { + +template +concept parallel_scan_body = splittable && + requires( Body& body, const Range& range, Body& other ) { + body(range, tbb::detail::d1::pre_scan_tag{}); + body(range, tbb::detail::d1::final_scan_tag{}); + body.reverse_join(other); + body.assign(other); + }; + +template +concept parallel_scan_function = std::invocable&, + const Range&, const Value&, bool> && + std::convertible_to&, + const Range&, const Value&, bool>, + Value>; + +template +concept parallel_scan_combine = std::invocable&, + const Value&, const Value&> && + std::convertible_to&, + const Value&, const Value&>, + Value>; + +} // namespace d0 +namespace d1 { +#endif // __TBB_CPP20_CONCEPTS_PRESENT + +//! 
Performs final scan for a leaf +/** @ingroup algorithms */ +template +struct final_sum : public task { +private: + using sum_node_type = sum_node; + Body m_body; + aligned_space m_range; + //! Where to put result of last subrange, or nullptr if not last subrange. + Body* m_stuff_last; + + wait_context& m_wait_context; + sum_node_type* m_parent = nullptr; +public: + small_object_allocator m_allocator; + final_sum( Body& body, wait_context& w_o, small_object_allocator& alloc ) : + m_body(body, split()), m_wait_context(w_o), m_allocator(alloc) { + poison_pointer(m_stuff_last); + } + + final_sum( final_sum& sum, small_object_allocator& alloc ) : + m_body(sum.m_body, split()), m_wait_context(sum.m_wait_context), m_allocator(alloc) { + poison_pointer(m_stuff_last); + } + + ~final_sum() { + m_range.begin()->~Range(); + } + void finish_construction( sum_node_type* parent, const Range& range, Body* stuff_last ) { + __TBB_ASSERT( m_parent == nullptr, nullptr ); + m_parent = parent; + new( m_range.begin() ) Range(range); + m_stuff_last = stuff_last; + } +private: + sum_node_type* release_parent() { + call_itt_task_notify(releasing, m_parent); + if (m_parent) { + auto parent = m_parent; + m_parent = nullptr; + if (parent->ref_count.fetch_sub(1) == 1) { + return parent; + } + } + else + m_wait_context.release(); + return nullptr; + } + sum_node_type* finalize(const execution_data& ed){ + sum_node_type* next_task = release_parent(); + m_allocator.delete_object(this, ed); + return next_task; + } + +public: + task* execute(execution_data& ed) override { + m_body( *m_range.begin(), final_scan_tag() ); + if( m_stuff_last ) + m_stuff_last->assign(m_body); + + return finalize(ed); + } + task* cancel(execution_data& ed) override { + return finalize(ed); + } + template + void operator()( const Range& r, Tag tag ) { + m_body( r, tag ); + } + void reverse_join( final_sum& a ) { + m_body.reverse_join(a.m_body); + } + void reverse_join( Body& body ) { + m_body.reverse_join(body); + } + void assign_to( Body& body ) { + body.assign(m_body); + } + void self_destroy(const execution_data& ed) { + m_allocator.delete_object(this, ed); + } +}; + +//! Split work to be done in the scan. +/** @ingroup algorithms */ +template +struct sum_node : public task { +private: + using final_sum_type = final_sum; +public: + final_sum_type *m_incoming; + final_sum_type *m_body; + Body *m_stuff_last; +private: + final_sum_type *m_left_sum; + sum_node *m_left; + sum_node *m_right; + bool m_left_is_final; + Range m_range; + wait_context& m_wait_context; + sum_node* m_parent; + small_object_allocator m_allocator; +public: + std::atomic ref_count{0}; + sum_node( const Range range, bool left_is_final_, sum_node* parent, wait_context& w_o, small_object_allocator& alloc ) : + m_stuff_last(nullptr), + m_left_sum(nullptr), + m_left(nullptr), + m_right(nullptr), + m_left_is_final(left_is_final_), + m_range(range), + m_wait_context(w_o), + m_parent(parent), + m_allocator(alloc) + { + if( m_parent ) + m_parent->ref_count.fetch_add(1); + // Poison fields that will be set by second pass. 
+ poison_pointer(m_body); + poison_pointer(m_incoming); + } + + ~sum_node() { + if (m_parent) + m_parent->ref_count.fetch_sub(1); + } +private: + sum_node* release_parent() { + call_itt_task_notify(releasing, m_parent); + if (m_parent) { + auto parent = m_parent; + m_parent = nullptr; + if (parent->ref_count.fetch_sub(1) == 1) { + return parent; + } + } + else + m_wait_context.release(); + return nullptr; + } + task* create_child( const Range& range, final_sum_type& body, sum_node* child, final_sum_type* incoming, Body* stuff_last ) { + if( child ) { + __TBB_ASSERT( is_poisoned(child->m_body) && is_poisoned(child->m_incoming), nullptr ); + child->prepare_for_execution(body, incoming, stuff_last); + return child; + } else { + body.finish_construction(this, range, stuff_last); + return &body; + } + } + + sum_node* finalize(const execution_data& ed) { + sum_node* next_task = release_parent(); + m_allocator.delete_object(this, ed); + return next_task; + } + +public: + void prepare_for_execution(final_sum_type& body, final_sum_type* incoming, Body *stuff_last) { + this->m_body = &body; + this->m_incoming = incoming; + this->m_stuff_last = stuff_last; + } + task* execute(execution_data& ed) override { + if( m_body ) { + if( m_incoming ) + m_left_sum->reverse_join( *m_incoming ); + task* right_child = this->create_child(Range(m_range,split()), *m_left_sum, m_right, m_left_sum, m_stuff_last); + task* left_child = m_left_is_final ? nullptr : this->create_child(m_range, *m_body, m_left, m_incoming, nullptr); + ref_count = (left_child != nullptr) + (right_child != nullptr); + m_body = nullptr; + if( left_child ) { + spawn(*right_child, *ed.context); + return left_child; + } else { + return right_child; + } + } else { + return finalize(ed); + } + } + task* cancel(execution_data& ed) override { + return finalize(ed); + } + void self_destroy(const execution_data& ed) { + m_allocator.delete_object(this, ed); + } + template + friend struct start_scan; + + template + friend struct finish_scan; +}; + +//! 
Combine partial results +/** @ingroup algorithms */ +template +struct finish_scan : public task { +private: + using sum_node_type = sum_node; + using final_sum_type = final_sum; + final_sum_type** const m_sum_slot; + sum_node_type*& m_return_slot; + small_object_allocator m_allocator; +public: + std::atomic m_right_zombie; + sum_node_type& m_result; + std::atomic ref_count{2}; + finish_scan* m_parent; + wait_context& m_wait_context; + task* execute(execution_data& ed) override { + __TBB_ASSERT( m_result.ref_count.load() == static_cast((m_result.m_left!=nullptr)+(m_result.m_right!=nullptr)), nullptr ); + if( m_result.m_left ) + m_result.m_left_is_final = false; + final_sum_type* right_zombie = m_right_zombie.load(std::memory_order_acquire); + if( right_zombie && m_sum_slot ) + (*m_sum_slot)->reverse_join(*m_result.m_left_sum); + __TBB_ASSERT( !m_return_slot, nullptr ); + if( right_zombie || m_result.m_right ) { + m_return_slot = &m_result; + } else { + m_result.self_destroy(ed); + } + if( right_zombie && !m_sum_slot && !m_result.m_right ) { + right_zombie->self_destroy(ed); + m_right_zombie.store(nullptr, std::memory_order_relaxed); + } + return finalize(ed); + } + task* cancel(execution_data& ed) override { + return finalize(ed); + } + finish_scan(sum_node_type*& return_slot, final_sum_type** sum, sum_node_type& result_, finish_scan* parent, wait_context& w_o, small_object_allocator& alloc) : + m_sum_slot(sum), + m_return_slot(return_slot), + m_allocator(alloc), + m_right_zombie(nullptr), + m_result(result_), + m_parent(parent), + m_wait_context(w_o) + { + __TBB_ASSERT( !m_return_slot, nullptr ); + } +private: + finish_scan* release_parent() { + call_itt_task_notify(releasing, m_parent); + if (m_parent) { + auto parent = m_parent; + m_parent = nullptr; + if (parent->ref_count.fetch_sub(1) == 1) { + return parent; + } + } + else + m_wait_context.release(); + return nullptr; + } + finish_scan* finalize(const execution_data& ed) { + finish_scan* next_task = release_parent(); + m_allocator.delete_object(this, ed); + return next_task; + } +}; + +//! Initial task to split the work +/** @ingroup algorithms */ +template +struct start_scan : public task { +private: + using sum_node_type = sum_node; + using final_sum_type = final_sum; + using finish_pass1_type = finish_scan; + std::reference_wrapper m_return_slot; + Range m_range; + std::reference_wrapper m_body; + typename Partitioner::partition_type m_partition; + /** Non-null if caller is requesting total. 
*/ + final_sum_type** m_sum_slot; + bool m_is_final; + bool m_is_right_child; + + finish_pass1_type* m_parent; + small_object_allocator m_allocator; + wait_context& m_wait_context; + + finish_pass1_type* release_parent() { + call_itt_task_notify(releasing, m_parent); + if (m_parent) { + auto parent = m_parent; + m_parent = nullptr; + if (parent->ref_count.fetch_sub(1) == 1) { + return parent; + } + } + else + m_wait_context.release(); + return nullptr; + } + + finish_pass1_type* finalize( const execution_data& ed ) { + finish_pass1_type* next_task = release_parent(); + m_allocator.delete_object(this, ed); + return next_task; + } + +public: + task* execute( execution_data& ) override; + task* cancel( execution_data& ed ) override { + return finalize(ed); + } + start_scan( sum_node_type*& return_slot, start_scan& parent, small_object_allocator& alloc ) : + m_return_slot(return_slot), + m_range(parent.m_range,split()), + m_body(parent.m_body), + m_partition(parent.m_partition,split()), + m_sum_slot(parent.m_sum_slot), + m_is_final(parent.m_is_final), + m_is_right_child(true), + m_parent(parent.m_parent), + m_allocator(alloc), + m_wait_context(parent.m_wait_context) + { + __TBB_ASSERT( !m_return_slot, nullptr ); + parent.m_is_right_child = false; + } + + start_scan( sum_node_type*& return_slot, const Range& range, final_sum_type& body, const Partitioner& partitioner, wait_context& w_o, small_object_allocator& alloc ) : + m_return_slot(return_slot), + m_range(range), + m_body(body), + m_partition(partitioner), + m_sum_slot(nullptr), + m_is_final(true), + m_is_right_child(false), + m_parent(nullptr), + m_allocator(alloc), + m_wait_context(w_o) + { + __TBB_ASSERT( !m_return_slot, nullptr ); + } + + static void run( const Range& range, Body& body, const Partitioner& partitioner ) { + if( !range.empty() ) { + task_group_context context(PARALLEL_SCAN); + + using start_pass1_type = start_scan; + sum_node_type* root = nullptr; + wait_context w_ctx{1}; + small_object_allocator alloc{}; + + auto& temp_body = *alloc.new_object(body, w_ctx, alloc); + temp_body.reverse_join(body); + + auto& pass1 = *alloc.new_object(/*m_return_slot=*/root, range, temp_body, partitioner, w_ctx, alloc); + + execute_and_wait(pass1, context, w_ctx, context); + if( root ) { + root->prepare_for_execution(temp_body, nullptr, &body); + w_ctx.reserve(); + execute_and_wait(*root, context, w_ctx, context); + } else { + temp_body.assign_to(body); + temp_body.finish_construction(nullptr, range, nullptr); + alloc.delete_object(&temp_body); + } + } + } +}; + +template +task* start_scan::execute( execution_data& ed ) { + // Inspecting m_parent->result.left_sum would ordinarily be a race condition. + // But we inspect it only if we are not a stolen task, in which case we + // know that task assigning to m_parent->result.left_sum has completed. 
+ __TBB_ASSERT(!m_is_right_child || m_parent, "right child is never an orphan"); + bool treat_as_stolen = m_is_right_child && (is_stolen(ed) || &m_body.get()!=m_parent->m_result.m_left_sum); + if( treat_as_stolen ) { + // Invocation is for right child that has been really stolen or needs to be virtually stolen + small_object_allocator alloc{}; + final_sum_type* right_zombie = alloc.new_object(m_body, alloc); + m_parent->m_right_zombie.store(right_zombie, std::memory_order_release); + m_body = *right_zombie; + m_is_final = false; + } + task* next_task = nullptr; + if( (m_is_right_child && !treat_as_stolen) || !m_range.is_divisible() || m_partition.should_execute_range(ed) ) { + if( m_is_final ) + m_body(m_range, final_scan_tag()); + else if( m_sum_slot ) + m_body(m_range, pre_scan_tag()); + if( m_sum_slot ) + *m_sum_slot = &m_body.get(); + __TBB_ASSERT( !m_return_slot, nullptr ); + + next_task = finalize(ed); + } else { + small_object_allocator alloc{}; + auto result = alloc.new_object(m_range,/*m_left_is_final=*/m_is_final, m_parent? &m_parent->m_result: nullptr, m_wait_context, alloc); + + auto new_parent = alloc.new_object(m_return_slot, m_sum_slot, *result, m_parent, m_wait_context, alloc); + m_parent = new_parent; + + // Split off right child + auto& right_child = *alloc.new_object(/*m_return_slot=*/result->m_right, *this, alloc); + + spawn(right_child, *ed.context); + + m_sum_slot = &result->m_left_sum; + m_return_slot = result->m_left; + + __TBB_ASSERT( !m_return_slot, nullptr ); + next_task = this; + } + return next_task; +} + +template +class lambda_scan_body { + Value m_sum_slot; + const Value& identity_element; + const Scan& m_scan; + const ReverseJoin& m_reverse_join; +public: + void operator=(const lambda_scan_body&) = delete; + lambda_scan_body(const lambda_scan_body&) = default; + + lambda_scan_body( const Value& identity, const Scan& scan, const ReverseJoin& rev_join ) + : m_sum_slot(identity) + , identity_element(identity) + , m_scan(scan) + , m_reverse_join(rev_join) {} + + lambda_scan_body( lambda_scan_body& b, split ) + : m_sum_slot(b.identity_element) + , identity_element(b.identity_element) + , m_scan(b.m_scan) + , m_reverse_join(b.m_reverse_join) {} + + template + void operator()( const Range& r, Tag tag ) { + m_sum_slot = tbb::detail::invoke(m_scan, r, m_sum_slot, tag); + } + + void reverse_join( lambda_scan_body& a ) { + m_sum_slot = tbb::detail::invoke(m_reverse_join, a.m_sum_slot, m_sum_slot); + } + + void assign( lambda_scan_body& b ) { + m_sum_slot = b.m_sum_slot; + } + + Value result() const { + return m_sum_slot; + } +}; + +// Requirements on Range concept are documented in blocked_range.h + +/** \page parallel_scan_body_req Requirements on parallel_scan body + Class \c Body implementing the concept of parallel_scan body must define: + - \code Body::Body( Body&, split ); \endcode Splitting constructor. + Split \c b so that \c this and \c b can accumulate separately + - \code Body::~Body(); \endcode Destructor + - \code void Body::operator()( const Range& r, pre_scan_tag ); \endcode + Preprocess iterations for range \c r + - \code void Body::operator()( const Range& r, final_scan_tag ); \endcode + Do final processing for iterations of range \c r + - \code void Body::reverse_join( Body& a ); \endcode + Merge preprocessing state of \c a into \c this, where \c a was + created earlier from \c b by b's splitting constructor +**/ + +/** \name parallel_scan + See also requirements on \ref range_req "Range" and \ref parallel_scan_body_req "parallel_scan Body". 
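A minimal body satisfying the requirements listed above: a prefix (running) sum in which the pre_scan pass only accumulates and the final_scan pass also writes results out. Names are illustrative:

#include "third_party/tbb/parallel_scan.h"
#include "third_party/tbb/blocked_range.h"
#include <cstddef>

struct PrefixSumBody {
    const int* in;
    int* out;
    int sum = 0;
    PrefixSumBody(const int* i, int* o) : in(i), out(o) {}
    PrefixSumBody(PrefixSumBody& b, tbb::split) : in(b.in), out(b.out) {}  // accumulate separately
    template <typename Tag>
    void operator()(const tbb::blocked_range<std::size_t>& r, Tag) {
        int running = sum;
        for (std::size_t i = r.begin(); i != r.end(); ++i) {
            running += in[i];
            if (Tag::is_final_scan()) out[i] = running;   // only the final pass stores
        }
        sum = running;
    }
    void reverse_join(PrefixSumBody& left) { sum = left.sum + sum; }  // `left` covers earlier iterations
    void assign(PrefixSumBody& b) { sum = b.sum; }
};

void prefix_sum(const int* in, int* out, std::size_t n) {
    PrefixSumBody body(in, out);
    tbb::parallel_scan(tbb::blocked_range<std::size_t>(0, n), body);
    // body.sum now holds the grand total.
}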
**/ +//@{ + +//! Parallel prefix with default partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_scan_body) +void parallel_scan( const Range& range, Body& body ) { + start_scan::run(range,body,__TBB_DEFAULT_PARTITIONER()); +} + +//! Parallel prefix with simple_partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_scan_body) +void parallel_scan( const Range& range, Body& body, const simple_partitioner& partitioner ) { + start_scan::run(range, body, partitioner); +} + +//! Parallel prefix with auto_partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_scan_body) +void parallel_scan( const Range& range, Body& body, const auto_partitioner& partitioner ) { + start_scan::run(range, body, partitioner); +} + +//! Parallel prefix with default partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_scan_function && + parallel_scan_combine) +Value parallel_scan( const Range& range, const Value& identity, const Scan& scan, const ReverseJoin& reverse_join ) { + lambda_scan_body body(identity, scan, reverse_join); + parallel_scan(range, body, __TBB_DEFAULT_PARTITIONER()); + return body.result(); +} + +//! Parallel prefix with simple_partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_scan_function && + parallel_scan_combine) +Value parallel_scan( const Range& range, const Value& identity, const Scan& scan, const ReverseJoin& reverse_join, + const simple_partitioner& partitioner ) { + lambda_scan_body body(identity, scan, reverse_join); + parallel_scan(range, body, partitioner); + return body.result(); +} + +//! Parallel prefix with auto_partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_scan_function && + parallel_scan_combine) +Value parallel_scan( const Range& range, const Value& identity, const Scan& scan, const ReverseJoin& reverse_join, + const auto_partitioner& partitioner ) { + lambda_scan_body body(identity, scan, reverse_join); + parallel_scan(range, body, partitioner); + return body.result(); +} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { + using detail::d1::parallel_scan; + using detail::d1::pre_scan_tag; + using detail::d1::final_scan_tag; +} // namespace v1 + +} // namespace tbb + +#endif /* __TBB_parallel_scan_H */ diff --git a/third_party/tbb/parallel_sort.h b/third_party/tbb/parallel_sort.h new file mode 100644 index 000000000..b089b9d99 --- /dev/null +++ b/third_party/tbb/parallel_sort.h @@ -0,0 +1,289 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
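The functional parallel_scan overloads above express the same computation with an identity, a scan functor (whose bool argument distinguishes the final pass), and a combiner. A short sketch with illustrative names:

#include "third_party/tbb/parallel_scan.h"
#include "third_party/tbb/blocked_range.h"
#include <cstddef>
#include <vector>

int prefix_sum(const std::vector<int>& in, std::vector<int>& out) {
    return tbb::parallel_scan(
        tbb::blocked_range<std::size_t>(0, in.size()),
        0,                                                        // identity
        [&](const tbb::blocked_range<std::size_t>& r, int sum, bool is_final) {
            for (std::size_t i = r.begin(); i != r.end(); ++i) {
                sum += in[i];
                if (is_final) out[i] = sum;
            }
            return sum;
        },
        [](int left, int right) { return left + right; });        // combine partial sums
}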
+*/ + +#ifndef __TBB_parallel_sort_H +#define __TBB_parallel_sort_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/parallel_for.h" +#include "third_party/tbb/blocked_range.h" +#include "third_party/tbb/profiling.h" + +#include "third_party/libcxx/algorithm" +#include "third_party/libcxx/iterator" +#include "third_party/libcxx/functional" +#include "third_party/libcxx/cstddef" + +namespace tbb { +namespace detail { +#if __TBB_CPP20_CONCEPTS_PRESENT +inline namespace d0 { + +// TODO: consider using std::strict_weak_order concept +template +concept compare = requires( const std::remove_reference_t& comp, typename std::iterator_traits::reference value ) { + // Forward via iterator_traits::reference + { comp(typename std::iterator_traits::reference(value), + typename std::iterator_traits::reference(value)) } -> std::convertible_to; +}; + +// Inspired by std::__PartiallyOrderedWith exposition only concept +template +concept less_than_comparable = requires( const std::remove_reference_t& lhs, + const std::remove_reference_t& rhs ) { + { lhs < rhs } -> boolean_testable; +}; + +} // namespace d0 +#endif // __TBB_CPP20_CONCEPTS_PRESENT +namespace d1 { + +//! Range used in quicksort to split elements into subranges based on a value. +/** The split operation selects a splitter and places all elements less than or equal + to the value in the first range and the remaining elements in the second range. + @ingroup algorithms */ +template +class quick_sort_range { + std::size_t median_of_three( const RandomAccessIterator& array, std::size_t l, std::size_t m, std::size_t r ) const { + return comp(array[l], array[m]) ? ( comp(array[m], array[r]) ? m : ( comp(array[l], array[r]) ? r : l ) ) + : ( comp(array[r], array[m]) ? m : ( comp(array[r], array[l]) ? r : l ) ); + } + + std::size_t pseudo_median_of_nine( const RandomAccessIterator& array, const quick_sort_range& range ) const { + std::size_t offset = range.size / 8u; + return median_of_three(array, + median_of_three(array, 0 , offset, offset * 2), + median_of_three(array, offset * 3, offset * 4, offset * 5), + median_of_three(array, offset * 6, offset * 7, range.size - 1)); + + } + + std::size_t split_range( quick_sort_range& range ) { + RandomAccessIterator array = range.begin; + RandomAccessIterator first_element = range.begin; + std::size_t m = pseudo_median_of_nine(array, range); + if( m != 0 ) std::iter_swap(array, array + m); + + std::size_t i = 0; + std::size_t j = range.size; + // Partition interval [i + 1,j - 1] with key *first_element. + for(;;) { + __TBB_ASSERT( i < j, nullptr ); + // Loop must terminate since array[l] == *first_element. + do { + --j; + __TBB_ASSERT( i <= j, "bad ordering relation?" ); + } while( comp(*first_element, array[j]) ); + do { + __TBB_ASSERT( i <= j, nullptr ); + if( i == j ) goto partition; + ++i; + } while( comp(array[i], *first_element) ); + if( i == j ) goto partition; + std::iter_swap(array + i, array + j); + } +partition: + // Put the partition key were it belongs + std::iter_swap(array + j, first_element); + // array[l..j) is less or equal to key. + // array(j..r) is greater or equal to key. 
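A worked illustration of the pivot selection above (a sketch, not library code): median_of_three orders three probes with the comparator, and pseudo_median_of_nine takes the median of three such medians drawn from the front, middle, and back of the range, which makes a pathological pivot much less likely.

#include <cstddef>

// Same selection logic as median_of_three above, specialized to int and operator<.
static std::size_t median_of_three_idx(const int* a, std::size_t l, std::size_t m, std::size_t r) {
    return a[l] < a[m] ? (a[m] < a[r] ? m : (a[l] < a[r] ? r : l))
                       : (a[r] < a[m] ? m : (a[r] < a[l] ? r : l));
}

// For a strictly descending array of 80 elements, offset = 80/8 = 10, so the
// probe triples are {0,10,20}, {30,40,50}, {60,70,79}; each yields its middle
// index, and the median of indices {10,40,70} is 40, an element near the true
// median, so the resulting split is close to even.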
+ // array[j] is equal to key + i = j + 1; + std::size_t new_range_size = range.size - i; + range.size = j; + return new_range_size; + } + +public: + quick_sort_range() = default; + quick_sort_range( const quick_sort_range& ) = default; + void operator=( const quick_sort_range& ) = delete; + + static constexpr std::size_t grainsize = 500; + const Compare& comp; + std::size_t size; + RandomAccessIterator begin; + + quick_sort_range( RandomAccessIterator begin_, std::size_t size_, const Compare& comp_ ) : + comp(comp_), size(size_), begin(begin_) {} + + bool empty() const { return size == 0; } + bool is_divisible() const { return size >= grainsize; } + + quick_sort_range( quick_sort_range& range, split ) + : comp(range.comp) + , size(split_range(range)) + // +1 accounts for the pivot element, which is at its correct place + // already and, therefore, is not included into subranges. + , begin(range.begin + range.size + 1) {} +}; + +//! Body class used to test if elements in a range are presorted +/** @ingroup algorithms */ +template +class quick_sort_pretest_body { + const Compare& comp; + task_group_context& context; + +public: + quick_sort_pretest_body() = default; + quick_sort_pretest_body( const quick_sort_pretest_body& ) = default; + void operator=( const quick_sort_pretest_body& ) = delete; + + quick_sort_pretest_body( const Compare& _comp, task_group_context& _context ) : comp(_comp), context(_context) {} + + void operator()( const blocked_range& range ) const { + RandomAccessIterator my_end = range.end(); + + int i = 0; + //TODO: consider using std::is_sorted() for each 64 iterations (requires performance measurements) + for( RandomAccessIterator k = range.begin(); k != my_end; ++k, ++i ) { + if( i % 64 == 0 && context.is_group_execution_cancelled() ) break; + + // The k - 1 is never out-of-range because the first chunk starts at begin+serial_cutoff+1 + if( comp(*(k), *(k - 1)) ) { + context.cancel_group_execution(); + break; + } + } + } +}; + +//! Body class used to sort elements in a range that is smaller than the grainsize. +/** @ingroup algorithms */ +template +struct quick_sort_body { + void operator()( const quick_sort_range& range ) const { + std::sort(range.begin, range.begin + range.size, range.comp); + } +}; + +//! Method to perform parallel_for based quick sort. +/** @ingroup algorithms */ +template +void do_parallel_quick_sort( RandomAccessIterator begin, RandomAccessIterator end, const Compare& comp ) { + parallel_for(quick_sort_range(begin, end - begin, comp), + quick_sort_body(), + auto_partitioner()); +} + +//! Wrapper method to initiate the sort by calling parallel_for. +/** @ingroup algorithms */ +template +void parallel_quick_sort( RandomAccessIterator begin, RandomAccessIterator end, const Compare& comp ) { + task_group_context my_context(PARALLEL_SORT); + constexpr int serial_cutoff = 9; + + __TBB_ASSERT( begin + serial_cutoff < end, "min_parallel_size is smaller than serial cutoff?" 
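The pretest body above relies on cooperative cancellation: workers scan disjoint chunks, and the first one to see out-of-order neighbours cancels the whole group so the remaining chunks stop early. A generic sketch of that pattern (probably_sorted is an illustrative name):

#include "third_party/tbb/parallel_for.h"
#include "third_party/tbb/blocked_range.h"
#include "third_party/tbb/task_group.h"
#include <cstddef>
#include <vector>

bool probably_sorted(const std::vector<int>& v) {
    if (v.size() < 2) return true;
    tbb::task_group_context ctx;
    tbb::parallel_for(
        tbb::blocked_range<std::size_t>(1, v.size()),
        [&](const tbb::blocked_range<std::size_t>& r) {
            for (std::size_t i = r.begin(); i != r.end(); ++i) {
                if (ctx.is_group_execution_cancelled()) return;   // someone already failed
                if (v[i] < v[i - 1]) { ctx.cancel_group_execution(); return; }
            }
        },
        tbb::auto_partitioner(), ctx);
    return !ctx.is_group_execution_cancelled();   // cancelled == misordered pair found
}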
); + RandomAccessIterator k = begin; + for( ; k != begin + serial_cutoff; ++k ) { + if( comp(*(k + 1), *k) ) { + do_parallel_quick_sort(begin, end, comp); + return; + } + } + + // Check is input range already sorted + parallel_for(blocked_range(k + 1, end), + quick_sort_pretest_body(comp, my_context), + auto_partitioner(), + my_context); + + if( my_context.is_group_execution_cancelled() ) + do_parallel_quick_sort(begin, end, comp); +} + +/** \page parallel_sort_iter_req Requirements on iterators for parallel_sort + Requirements on the iterator type \c It and its value type \c T for \c parallel_sort: + + - \code void iter_swap( It a, It b ) \endcode Swaps the values of the elements the given + iterators \c a and \c b are pointing to. \c It should be a random access iterator. + + - \code bool Compare::operator()( const T& x, const T& y ) \endcode True if x comes before y; +**/ + +/** \name parallel_sort + See also requirements on \ref parallel_sort_iter_req "iterators for parallel_sort". **/ +//@{ + +#if __TBB_CPP20_CONCEPTS_PRESENT +template +using iter_value_type = typename std::iterator_traits::value_type; + +template +using range_value_type = typename std::iterator_traits>::value_type; +#endif + +//! Sorts the data in [begin,end) using the given comparator +/** The compare function object is used for all comparisons between elements during sorting. + The compare object must define a bool operator() function. + @ingroup algorithms **/ +template + __TBB_requires(std::random_access_iterator && + compare && + std::movable>) +void parallel_sort( RandomAccessIterator begin, RandomAccessIterator end, const Compare& comp ) { + constexpr int min_parallel_size = 500; + if( end > begin ) { + if( end - begin < min_parallel_size ) { + std::sort(begin, end, comp); + } else { + parallel_quick_sort(begin, end, comp); + } + } +} + +//! Sorts the data in [begin,end) with a default comparator \c std::less +/** @ingroup algorithms **/ +template + __TBB_requires(std::random_access_iterator && + less_than_comparable> && + std::movable>) +void parallel_sort( RandomAccessIterator begin, RandomAccessIterator end ) { + parallel_sort(begin, end, std::less::value_type>()); +} + +//! Sorts the data in rng using the given comparator +/** @ingroup algorithms **/ +template + __TBB_requires(container_based_sequence && + compare> && + std::movable>) +void parallel_sort( Range&& rng, const Compare& comp ) { + parallel_sort(std::begin(rng), std::end(rng), comp); +} + +//! Sorts the data in rng with a default comparator \c std::less +/** @ingroup algorithms **/ +template + __TBB_requires(container_based_sequence && + less_than_comparable> && + std::movable>) +void parallel_sort( Range&& rng ) { + parallel_sort(std::begin(rng), std::end(rng)); +} +//@} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { + using detail::d1::parallel_sort; +} // namespace v1 +} // namespace tbb + +#endif /*__TBB_parallel_sort_H*/ diff --git a/third_party/tbb/partitioner.h b/third_party/tbb/partitioner.h new file mode 100644 index 000000000..25a300028 --- /dev/null +++ b/third_party/tbb/partitioner.h @@ -0,0 +1,682 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
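Usage sketch for the public entry points above: sorting with an iterator pair, with a custom comparator, and with a whole container (include paths follow this tree's layout):

#include "third_party/tbb/parallel_sort.h"
#include <functional>
#include <vector>

void sort_examples(std::vector<int>& v) {
    tbb::parallel_sort(v.begin(), v.end());                        // iterator pair, std::less
    tbb::parallel_sort(v.begin(), v.end(), std::greater<int>());   // custom comparator
    tbb::parallel_sort(v);                                         // container overload
}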
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_partitioner_H +#define __TBB_partitioner_H + +#ifndef __TBB_INITIAL_CHUNKS +// initial task divisions per thread +#define __TBB_INITIAL_CHUNKS 2 +#endif +#ifndef __TBB_RANGE_POOL_CAPACITY +// maximum number of elements in range pool +#define __TBB_RANGE_POOL_CAPACITY 8 +#endif +#ifndef __TBB_INIT_DEPTH +// initial value for depth of range pool +#define __TBB_INIT_DEPTH 5 +#endif +#ifndef __TBB_DEMAND_DEPTH_ADD +// when imbalance is found range splits this value times more +#define __TBB_DEMAND_DEPTH_ADD 1 +#endif + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_aligned_space.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_range_common.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/detail/_small_object_pool.h" + +#include "third_party/tbb/cache_aligned_allocator.h" +#include "third_party/tbb/task_group.h" // task_group_context +#include "third_party/tbb/task_arena.h" + +#include "third_party/libcxx/algorithm" +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/type_traits" + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) + // Workaround for overzealous compiler warnings + #pragma warning (push) + #pragma warning (disable: 4244) +#endif + +namespace tbb { +namespace detail { + +namespace d1 { +class auto_partitioner; +class simple_partitioner; +class static_partitioner; +class affinity_partitioner; +class affinity_partition_type; +class affinity_partitioner_base; + +inline std::size_t get_initial_auto_partitioner_divisor() { + const std::size_t factor = 4; + return factor * static_cast(max_concurrency()); +} + +//! Defines entry point for affinity partitioner into oneTBB run-time library. +class affinity_partitioner_base: no_copy { + friend class affinity_partitioner; + friend class affinity_partition_type; + //! Array that remembers affinities of tree positions to affinity_id. + /** nullptr if my_size==0. */ + slot_id* my_array; + //! Number of elements in my_array. + std::size_t my_size; + //! Zeros the fields. + affinity_partitioner_base() : my_array(nullptr), my_size(0) {} + //! Deallocates my_array. + ~affinity_partitioner_base() { resize(0); } + //! Resize my_array. + /** Retains values if resulting size is the same. */ + void resize(unsigned factor) { + // Check factor to avoid asking for number of workers while there might be no arena. + unsigned max_threads_in_arena = static_cast(max_concurrency()); + std::size_t new_size = factor ? factor * max_threads_in_arena : 0; + if (new_size != my_size) { + if (my_array) { + r1::cache_aligned_deallocate(my_array); + // Following two assignments must be done here for sake of exception safety. 
+ my_array = nullptr; + my_size = 0; + } + if (new_size) { + my_array = static_cast(r1::cache_aligned_allocate(new_size * sizeof(slot_id))); + std::fill_n(my_array, new_size, no_slot); + my_size = new_size; + } + } + } +}; + +template struct start_for; +template struct start_scan; +template struct start_reduce; +template struct start_deterministic_reduce; + +struct node { + node* my_parent{}; + std::atomic m_ref_count{}; + + node() = default; + node(node* parent, int ref_count) : + my_parent{parent}, m_ref_count{ref_count} { + __TBB_ASSERT(ref_count > 0, "The ref count must be positive"); + } +}; + +struct wait_node : node { + wait_node() : node{ nullptr, 1 } {} + wait_context m_wait{1}; +}; + +//! Join task node that contains shared flag for stealing feedback +struct tree_node : public node { + small_object_allocator m_allocator; + std::atomic m_child_stolen{false}; + + tree_node(node* parent, int ref_count, small_object_allocator& alloc) + : node{parent, ref_count} + , m_allocator{alloc} {} + + void join(task_group_context*) {/*dummy, required only for reduction algorithms*/}; + + template + static void mark_task_stolen(Task &t) { + std::atomic &flag = static_cast(t.my_parent)->m_child_stolen; +#if TBB_USE_PROFILING_TOOLS + // Threading tools respect lock prefix but report false-positive data-race via plain store + flag.exchange(true); +#else + flag.store(true, std::memory_order_relaxed); +#endif // TBB_USE_PROFILING_TOOLS + } + template + static bool is_peer_stolen(Task &t) { + return static_cast(t.my_parent)->m_child_stolen.load(std::memory_order_relaxed); + } +}; + +// Context used to check cancellation state during reduction join process +template +void fold_tree(node* n, const execution_data& ed) { + for (;;) { + __TBB_ASSERT(n, nullptr); + __TBB_ASSERT(n->m_ref_count.load(std::memory_order_relaxed) > 0, "The refcount must be positive."); + call_itt_task_notify(releasing, n); + if (--n->m_ref_count > 0) { + return; + } + node* parent = n->my_parent; + if (!parent) { + break; + }; + + call_itt_task_notify(acquired, n); + TreeNodeType* self = static_cast(n); + self->join(ed.context); + self->m_allocator.delete_object(self, ed); + n = parent; + } + // Finish parallel for execution when the root (last node) is reached + static_cast(n)->m_wait.release(); +} + +//! Depth is a relative depth of recursive division inside a range pool. Relative depth allows +//! infinite absolute depth of the recursion for heavily unbalanced workloads with range represented +//! by a number that cannot fit into machine word. +typedef unsigned char depth_t; + +//! Range pool stores ranges of type T in a circular buffer with MaxCapacity +template +class range_vector { + depth_t my_head; + depth_t my_tail; + depth_t my_size; + depth_t my_depth[MaxCapacity]; // relative depths of stored ranges + tbb::detail::aligned_space my_pool; + +public: + //! initialize via first range in pool + range_vector(const T& elem) : my_head(0), my_tail(0), my_size(1) { + my_depth[0] = 0; + new( static_cast(my_pool.begin()) ) T(elem);//TODO: std::move? + } + ~range_vector() { + while( !empty() ) pop_back(); + } + bool empty() const { return my_size == 0; } + depth_t size() const { return my_size; } + //! Populates range pool via ranges up to max depth or while divisible + //! max_depth starts from 0, e.g. 
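A simplified sketch (not the library code) of the bottom-up folding pattern fold_tree implements above: each finished child decrements its parent's reference count, and whichever child brings the count to zero continues folding upward until the root's waiter is released.

#include <atomic>

struct RefNode {                               // illustrative stand-in for `node`
    RefNode* parent = nullptr;
    std::atomic<int> ref_count{0};
};

template <typename Join, typename Release>
void fold_upward(RefNode* n, Join join_and_free, Release release_root) {
    for (;;) {
        if (--n->ref_count > 0) return;        // a sibling still runs; it will continue the fold
        RefNode* parent = n->parent;
        if (!parent) break;                    // reached the root
        join_and_free(n);                      // merge this subtree's result into the parent
        n = parent;
    }
    release_root(n);                           // wake the thread waiting on the whole tree
}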
value 2 makes 3 ranges in the pool up to two 1/4 pieces + void split_to_fill(depth_t max_depth) { + while( my_size < MaxCapacity && is_divisible(max_depth) ) { + depth_t prev = my_head; + my_head = (my_head + 1) % MaxCapacity; + new(my_pool.begin()+my_head) T(my_pool.begin()[prev]); // copy TODO: std::move? + my_pool.begin()[prev].~T(); // instead of assignment + new(my_pool.begin()+prev) T(my_pool.begin()[my_head], detail::split()); // do 'inverse' split + my_depth[my_head] = ++my_depth[prev]; + my_size++; + } + } + void pop_back() { + __TBB_ASSERT(my_size > 0, "range_vector::pop_back() with empty size"); + my_pool.begin()[my_head].~T(); + my_size--; + my_head = (my_head + MaxCapacity - 1) % MaxCapacity; + } + void pop_front() { + __TBB_ASSERT(my_size > 0, "range_vector::pop_front() with empty size"); + my_pool.begin()[my_tail].~T(); + my_size--; + my_tail = (my_tail + 1) % MaxCapacity; + } + T& back() { + __TBB_ASSERT(my_size > 0, "range_vector::back() with empty size"); + return my_pool.begin()[my_head]; + } + T& front() { + __TBB_ASSERT(my_size > 0, "range_vector::front() with empty size"); + return my_pool.begin()[my_tail]; + } + //! similarly to front(), returns depth of the first range in the pool + depth_t front_depth() { + __TBB_ASSERT(my_size > 0, "range_vector::front_depth() with empty size"); + return my_depth[my_tail]; + } + depth_t back_depth() { + __TBB_ASSERT(my_size > 0, "range_vector::back_depth() with empty size"); + return my_depth[my_head]; + } + bool is_divisible(depth_t max_depth) { + return back_depth() < max_depth && back().is_divisible(); + } +}; + +//! Provides default methods for partition objects and common algorithm blocks. +template +struct partition_type_base { + typedef detail::split split_type; + // decision makers + void note_affinity( slot_id ) {} + template + bool check_being_stolen(Task&, const execution_data&) { return false; } // part of old should_execute_range() + template split_type get_split() { return split(); } + Partition& self() { return *static_cast(this); } // CRTP helper + + template + void work_balance(StartType &start, Range &range, const execution_data&) { + start.run_body( range ); // static partitioner goes here + } + + template + void execute(StartType &start, Range &range, execution_data& ed) { + // The algorithm in a few words ([]-denotes calls to decision methods of partitioner): + // [If this task is stolen, adjust depth and divisions if necessary, set flag]. + // If range is divisible { + // Spread the work while [initial divisions left]; + // Create trap task [if necessary]; + // } + // If not divisible or [max depth is reached], execute, else do the range pool part + if ( range.is_divisible() ) { + if ( self().is_divisible() ) { + do { // split until is divisible + typename Partition::split_type split_obj = self().template get_split(); + start.offer_work( split_obj, ed ); + } while ( range.is_divisible() && self().is_divisible() ); + } + } + self().work_balance(start, range, ed); + } +}; + +//! Provides default splitting strategy for partition objects. +template +struct adaptive_mode : partition_type_base { + typedef Partition my_partition; + std::size_t my_divisor; + // For affinity_partitioner, my_divisor indicates the number of affinity array indices the task reserves. + // A task which has only one index must produce the right split without reserved index in order to avoid + // it to be overwritten in note_affinity() of the created (right) task. + // I.e. 
a task created deeper than the affinity array can remember must not save its affinity (LIFO order) + static const unsigned factor = 1; + adaptive_mode() : my_divisor(get_initial_auto_partitioner_divisor() / 4 * my_partition::factor) {} + adaptive_mode(adaptive_mode &src, split) : my_divisor(do_split(src, split())) {} + adaptive_mode(adaptive_mode&, const proportional_split&) : my_divisor(0) + { + // left blank as my_divisor gets overridden in the successors' constructors + } + /*! Override do_split methods in order to specify splitting strategy */ + std::size_t do_split(adaptive_mode &src, split) { + return src.my_divisor /= 2u; + } +}; + + +//! Provides proportional splitting strategy for partition objects +template +struct proportional_mode : adaptive_mode { + typedef Partition my_partition; + using partition_type_base::self; // CRTP helper to get access to derived classes + + proportional_mode() : adaptive_mode() {} + proportional_mode(proportional_mode &src, split) : adaptive_mode(src, split()) {} + proportional_mode(proportional_mode &src, const proportional_split& split_obj) + : adaptive_mode(src, split_obj) + { + self().my_divisor = do_split(src, split_obj); + } + std::size_t do_split(proportional_mode &src, const proportional_split& split_obj) { + std::size_t portion = split_obj.right() * my_partition::factor; + portion = (portion + my_partition::factor/2) & (0ul - my_partition::factor); + src.my_divisor -= portion; + return portion; + } + bool is_divisible() { // part of old should_execute_range() + return self().my_divisor > my_partition::factor; + } + template + proportional_split get_split() { + // Create the proportion from partitioner internal resources (threads) that would be used: + // - into proportional_mode constructor to split the partitioner + // - if Range supports the proportional_split constructor it would use proposed proportion, + // otherwise, the tbb::proportional_split object will be implicitly (for Range implementor) + // casted to tbb::split + + std::size_t n = self().my_divisor / my_partition::factor; + std::size_t right = n / 2; + std::size_t left = n - right; + return proportional_split(left, right); + } +}; + +static std::size_t get_initial_partition_head() { + int current_index = tbb::this_task_arena::current_thread_index(); + if (current_index == tbb::task_arena::not_initialized) + current_index = 0; + return size_t(current_index); +} + +//! Provides default linear indexing of partitioner's sequence +template +struct linear_affinity_mode : proportional_mode { + std::size_t my_head; + std::size_t my_max_affinity; + using proportional_mode::self; + linear_affinity_mode() : proportional_mode(), my_head(get_initial_partition_head()), + my_max_affinity(self().my_divisor) {} + linear_affinity_mode(linear_affinity_mode &src, split) : proportional_mode(src, split()) + , my_head((src.my_head + src.my_divisor) % src.my_max_affinity), my_max_affinity(src.my_max_affinity) {} + linear_affinity_mode(linear_affinity_mode &src, const proportional_split& split_obj) : proportional_mode(src, split_obj) + , my_head((src.my_head + src.my_divisor) % src.my_max_affinity), my_max_affinity(src.my_max_affinity) {} + void spawn_task(task& t, task_group_context& ctx) { + if (self().my_divisor) { + spawn(t, ctx, slot_id(my_head)); + } else { + spawn(t, ctx); + } + } +}; + +static bool is_stolen_task(const execution_data& ed) { + return execution_slot(ed) != original_slot(ed); +} + +/*! 
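Two small worked notes on the arithmetic above, assuming a power-of-two factor (as affinity_partition_type asserts further below): the adaptive divisor simply halves on every split (do_split returns src.my_divisor /= 2u), and the proportional portion is rounded to the nearest multiple of the factor, since 0ul - factor equals ~(factor - 1).

#include <cstddef>

// Minimal sketch of the rounding in proportional_mode::do_split.
constexpr std::size_t round_to_factor(std::size_t portion, std::size_t factor) {
    return (portion + factor / 2) & (0ul - factor);   // nearest multiple, ties round up
}
static_assert(round_to_factor(40, 16) == 48, "tie rounds up");
static_assert(round_to_factor(52, 16) == 48, "rounds to nearest multiple");
static_assert(round_to_factor(60, 16) == 64, "rounds to nearest multiple");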
Determine work-balance phase implementing splitting & stealing actions */ +template +struct dynamic_grainsize_mode : Mode { + using Mode::self; + enum { + begin = 0, + run, + pass + } my_delay; + depth_t my_max_depth; + static const unsigned range_pool_size = __TBB_RANGE_POOL_CAPACITY; + dynamic_grainsize_mode(): Mode() + , my_delay(begin) + , my_max_depth(__TBB_INIT_DEPTH) {} + dynamic_grainsize_mode(dynamic_grainsize_mode& p, split) + : Mode(p, split()) + , my_delay(pass) + , my_max_depth(p.my_max_depth) {} + dynamic_grainsize_mode(dynamic_grainsize_mode& p, const proportional_split& split_obj) + : Mode(p, split_obj) + , my_delay(begin) + , my_max_depth(p.my_max_depth) {} + template + bool check_being_stolen(Task &t, const execution_data& ed) { // part of old should_execute_range() + if( !(self().my_divisor / Mode::my_partition::factor) ) { // if not from the top P tasks of binary tree + self().my_divisor = 1; // TODO: replace by on-stack flag (partition_state's member)? + if( is_stolen_task(ed) && t.my_parent->m_ref_count >= 2 ) { // runs concurrently with the left task +#if __TBB_USE_OPTIONAL_RTTI + // RTTI is available, check whether the cast is valid + // TODO: TBB_REVAMP_TODO __TBB_ASSERT(dynamic_cast(t.m_parent), 0); + // correctness of the cast relies on avoiding the root task for which: + // - initial value of my_divisor != 0 (protected by separate assertion) + // - is_stolen_task() always returns false for the root task. +#endif + tree_node::mark_task_stolen(t); + if( !my_max_depth ) my_max_depth++; + my_max_depth += __TBB_DEMAND_DEPTH_ADD; + return true; + } + } + return false; + } + depth_t max_depth() { return my_max_depth; } + void align_depth(depth_t base) { + __TBB_ASSERT(base <= my_max_depth, nullptr); + my_max_depth -= base; + } + template + void work_balance(StartType &start, Range &range, execution_data& ed) { + if( !range.is_divisible() || !self().max_depth() ) { + start.run_body( range ); + } + else { // do range pool + range_vector range_pool(range); + do { + range_pool.split_to_fill(self().max_depth()); // fill range pool + if( self().check_for_demand( start ) ) { + if( range_pool.size() > 1 ) { + start.offer_work( range_pool.front(), range_pool.front_depth(), ed ); + range_pool.pop_front(); + continue; + } + if( range_pool.is_divisible(self().max_depth()) ) // was not enough depth to fork a task + continue; // note: next split_to_fill() should split range at least once + } + start.run_body( range_pool.back() ); + range_pool.pop_back(); + } while( !range_pool.empty() && !ed.context->is_group_execution_cancelled() ); + } + } + template + bool check_for_demand(Task& t) { + if ( pass == my_delay ) { + if ( self().my_divisor > 1 ) // produce affinitized tasks while they have slot in array + return true; // do not do my_max_depth++ here, but be sure range_pool is splittable once more + else if ( self().my_divisor && my_max_depth ) { // make balancing task + self().my_divisor = 0; // once for each task; depth will be decreased in align_depth() + return true; + } + else if ( tree_node::is_peer_stolen(t) ) { + my_max_depth += __TBB_DEMAND_DEPTH_ADD; + return true; + } + } else if( begin == my_delay ) { + my_delay = pass; + } + return false; + } +}; + +class auto_partition_type: public dynamic_grainsize_mode > { +public: + auto_partition_type( const auto_partitioner& ) { + my_divisor *= __TBB_INITIAL_CHUNKS; + } + auto_partition_type( auto_partition_type& src, split) + : dynamic_grainsize_mode >(src, split()) {} + bool is_divisible() { // part of old should_execute_range() 
+ if( my_divisor > 1 ) return true; + if( my_divisor && my_max_depth ) { // can split the task. TODO: on-stack flag instead + // keep same fragmentation while splitting for the local task pool + my_max_depth--; + my_divisor = 0; // decrease max_depth once per task + return true; + } else return false; + } + template + bool check_for_demand(Task& t) { + if (tree_node::is_peer_stolen(t)) { + my_max_depth += __TBB_DEMAND_DEPTH_ADD; + return true; + } else return false; + } + void spawn_task(task& t, task_group_context& ctx) { + spawn(t, ctx); + } +}; + +class simple_partition_type: public partition_type_base { +public: + simple_partition_type( const simple_partitioner& ) {} + simple_partition_type( const simple_partition_type&, split ) {} + //! simplified algorithm + template + void execute(StartType &start, Range &range, execution_data& ed) { + split_type split_obj = split(); // start.offer_work accepts split_type as reference + while( range.is_divisible() ) + start.offer_work( split_obj, ed ); + start.run_body( range ); + } + void spawn_task(task& t, task_group_context& ctx) { + spawn(t, ctx); + } +}; + +class static_partition_type : public linear_affinity_mode { +public: + typedef detail::proportional_split split_type; + static_partition_type( const static_partitioner& ) {} + static_partition_type( static_partition_type& p, const proportional_split& split_obj ) + : linear_affinity_mode(p, split_obj) {} +}; + +class affinity_partition_type : public dynamic_grainsize_mode > { + static const unsigned factor_power = 4; // TODO: get a unified formula based on number of computing units + slot_id* my_array; +public: + static const unsigned factor = 1 << factor_power; // number of slots in affinity array per task + typedef detail::proportional_split split_type; + affinity_partition_type( affinity_partitioner_base& ap ) { + __TBB_ASSERT( (factor&(factor-1))==0, "factor must be power of two" ); + ap.resize(factor); + my_array = ap.my_array; + my_max_depth = factor_power + 1; + __TBB_ASSERT( my_max_depth < __TBB_RANGE_POOL_CAPACITY, nullptr ); + } + affinity_partition_type(affinity_partition_type& p, split) + : dynamic_grainsize_mode >(p, split()) + , my_array(p.my_array) {} + affinity_partition_type(affinity_partition_type& p, const proportional_split& split_obj) + : dynamic_grainsize_mode >(p, split_obj) + , my_array(p.my_array) {} + void note_affinity(slot_id id) { + if( my_divisor ) + my_array[my_head] = id; + } + void spawn_task(task& t, task_group_context& ctx) { + if (my_divisor) { + if (!my_array[my_head]) { + // TODO: consider new ideas with my_array for both affinity and static partitioner's, then code reuse + spawn(t, ctx, slot_id(my_head / factor)); + } else { + spawn(t, ctx, my_array[my_head]); + } + } else { + spawn(t, ctx); + } + } +}; + +//! A simple partitioner +/** Divides the range until the range is not divisible. 
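The slot array and note_affinity() above are what make affinity_partitioner useful at the user level: reusing one partitioner object across repeated loops over the same data replays the recorded slot assignments, so iterations tend to land on threads whose caches already hold that data. A sketch (sweep is an illustrative name):

#include "third_party/tbb/parallel_for.h"
#include "third_party/tbb/partitioner.h"
#include "third_party/tbb/blocked_range.h"
#include <cstddef>
#include <vector>

void sweep(std::vector<float>& a, int sweeps) {
    tbb::affinity_partitioner ap;               // must outlive and be reused by every call
    for (int s = 0; s < sweeps; ++s) {
        tbb::parallel_for(
            tbb::blocked_range<std::size_t>(0, a.size()),
            [&](const tbb::blocked_range<std::size_t>& r) {
                for (std::size_t i = r.begin(); i != r.end(); ++i)
                    a[i] = 0.5f * a[i] + 1.0f;  // same elements touched every sweep
            },
            ap);                                // same partitioner object each time
    }
}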
+ @ingroup algorithms */ +class simple_partitioner { +public: + simple_partitioner() {} +private: + template friend struct start_for; + template friend struct start_reduce; + template friend struct start_deterministic_reduce; + template friend struct start_scan; + // new implementation just extends existing interface + typedef simple_partition_type task_partition_type; + // TODO: consider to make split_type public + typedef simple_partition_type::split_type split_type; + + // for parallel_scan only + class partition_type { + public: + bool should_execute_range(const execution_data& ) {return false;} + partition_type( const simple_partitioner& ) {} + partition_type( const partition_type&, split ) {} + }; +}; + +//! An auto partitioner +/** The range is initial divided into several large chunks. + Chunks are further subdivided into smaller pieces if demand detected and they are divisible. + @ingroup algorithms */ +class auto_partitioner { +public: + auto_partitioner() {} + +private: + template friend struct start_for; + template friend struct start_reduce; + template friend struct start_deterministic_reduce; + template friend struct start_scan; + // new implementation just extends existing interface + typedef auto_partition_type task_partition_type; + // TODO: consider to make split_type public + typedef auto_partition_type::split_type split_type; + + //! Backward-compatible partition for auto and affinity partition objects. + class partition_type { + size_t num_chunks; + static const size_t VICTIM_CHUNKS = 4; + public: + bool should_execute_range(const execution_data& ed) { + if( num_chunks friend struct start_for; + template friend struct start_reduce; + template friend struct start_deterministic_reduce; + template friend struct start_scan; + // new implementation just extends existing interface + typedef static_partition_type task_partition_type; + // TODO: consider to make split_type public + typedef static_partition_type::split_type split_type; +}; + +//! An affinity partitioner +class affinity_partitioner : affinity_partitioner_base { +public: + affinity_partitioner() {} + +private: + template friend struct start_for; + template friend struct start_reduce; + template friend struct start_deterministic_reduce; + template friend struct start_scan; + // new implementation just extends existing interface + typedef affinity_partition_type task_partition_type; + // TODO: consider to make split_type public + typedef affinity_partition_type::split_type split_type; +}; + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +// Partitioners +using detail::d1::auto_partitioner; +using detail::d1::simple_partitioner; +using detail::d1::static_partitioner; +using detail::d1::affinity_partitioner; +// Split types +using detail::split; +using detail::proportional_split; +} // namespace v1 + +} // namespace tbb + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) + #pragma warning (pop) +#endif // warning 4244 is back + +#undef __TBB_INITIAL_CHUNKS +#undef __TBB_RANGE_POOL_CAPACITY +#undef __TBB_INIT_DEPTH + +#endif /* __TBB_partitioner_H */ diff --git a/third_party/tbb/permit_manager.h b/third_party/tbb/permit_manager.h new file mode 100644 index 000000000..80f32daf4 --- /dev/null +++ b/third_party/tbb/permit_manager.h @@ -0,0 +1,61 @@ +// clang-format off +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
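The partitioner classes above are what user code actually passes to the algorithms; the choice selects the chunking policy for an otherwise identical loop. In this sketch, simple_partitioner splits down to the range's grainsize, auto_partitioner (the default) adapts chunk sizes at run time, and static_partitioner hands out an even upfront distribution with no further load balancing.

#include "third_party/tbb/parallel_for.h"
#include "third_party/tbb/partitioner.h"
#include "third_party/tbb/blocked_range.h"
#include <cstddef>

void scale(float* a, std::size_t n) {
    auto body = [&](const tbb::blocked_range<std::size_t>& r) {
        for (std::size_t i = r.begin(); i != r.end(); ++i) a[i] *= 2.0f;
    };
    tbb::parallel_for(tbb::blocked_range<std::size_t>(0, n, /*grainsize=*/1024),
                      body, tbb::simple_partitioner());
    tbb::parallel_for(tbb::blocked_range<std::size_t>(0, n), body);   // auto_partitioner by default
    tbb::parallel_for(tbb::blocked_range<std::size_t>(0, n), body, tbb::static_partitioner());
}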
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_permit_manager_H +#define _TBB_permit_manager_H + +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/thread_request_serializer.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class arena; +class pm_client; + +class permit_manager : no_copy { +public: + virtual ~permit_manager() {} + virtual pm_client* create_client(arena& a) = 0; + virtual void register_client(pm_client* client) = 0; + virtual void unregister_and_destroy_client(pm_client& c) = 0; + + virtual void set_active_num_workers(int soft_limit) = 0; + virtual void adjust_demand(pm_client&, int mandatory_delta, int workers_delta) = 0; + + void set_thread_request_observer(thread_request_observer& tr_observer) { + __TBB_ASSERT(!my_thread_request_observer, "set_thread_request_observer was called already?"); + my_thread_request_observer = &tr_observer; + } +protected: + void notify_thread_request(int delta) { + __TBB_ASSERT(my_thread_request_observer, "set_thread_request_observer was not called?"); + if (delta) { + my_thread_request_observer->update(delta); + } + } +private: + thread_request_observer* my_thread_request_observer{nullptr}; +}; + + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_permit_manager_H diff --git a/third_party/tbb/pm_client.h b/third_party/tbb/pm_client.h new file mode 100644 index 000000000..877e1d2a9 --- /dev/null +++ b/third_party/tbb/pm_client.h @@ -0,0 +1,71 @@ +// clang-format off +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
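As a reading aid: permit_manager's contract is that a single thread_request_observer is installed once and then receives only nonzero demand deltas. A minimal standalone sketch of that hand-off follows, with hypothetical toy types rather than the real r1::arena / r1::pm_client machinery.

// Hypothetical stand-ins; they mirror only the observer hand-off, not TBB's real types.
class toy_demand_observer {
public:
    virtual ~toy_demand_observer() = default;
    virtual void update(int delta) = 0;   // counterpart of thread_request_observer::update
};

class toy_permit_manager {
    toy_demand_observer* my_observer = nullptr;
public:
    void set_observer(toy_demand_observer& obs) { my_observer = &obs; } // installed exactly once
    void adjust_demand(int workers_delta) {
        // Counterpart of notify_thread_request(): forward only nonzero changes.
        if (workers_delta && my_observer)
            my_observer->update(workers_delta);
    }
};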
+*/ + +#ifndef _TBB_pm_client_H +#define _TBB_pm_client_H + +#include "third_party/tbb/arena.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class pm_client { +public: + pm_client(arena& a) : my_arena(a) {} + + unsigned priority_level() { + return my_arena.priority_level(); + } + + void set_top_priority(bool b) { + my_arena.set_top_priority(b); + } + + int min_workers() const { + return my_min_workers; + } + + int max_workers() const { + return my_max_workers; + } + + int update_request(int mandatory_delta, int workers_delta) { + auto min_max_workers = my_arena.update_request(mandatory_delta, workers_delta); + int delta = min_max_workers.second - my_max_workers; + set_workers(min_max_workers.first, min_max_workers.second); + return delta; + } + +protected: + void set_workers(int mn_w, int mx_w) { + __TBB_ASSERT(mn_w >= 0, nullptr); + __TBB_ASSERT(mx_w >= 0, nullptr); + my_min_workers = mn_w; + my_max_workers = mx_w; + } + + arena& my_arena; + int my_min_workers{0}; + int my_max_workers{0}; +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_pm_client_H diff --git a/third_party/tbb/private_server.cpp b/third_party/tbb/private_server.cpp new file mode 100644 index 000000000..8b7a758bd --- /dev/null +++ b/third_party/tbb/private_server.cpp @@ -0,0 +1,437 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/cache_aligned_allocator.h" +#include "third_party/tbb/mutex.h" + +#include "third_party/tbb/rml_tbb.h" +#include "third_party/tbb/rml_thread_monitor.h" + +#include "third_party/tbb/scheduler_common.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/misc.h" + +#include "third_party/libcxx/atomic" + + +namespace tbb { +namespace detail { +namespace r1 { +namespace rml { + +using rml::internal::thread_monitor; +typedef thread_monitor::handle_type thread_handle; + +class private_server; + +class private_worker: no_copy { +private: + //! State in finite-state machine that controls the worker. + /** State diagram: + init --> starting --> normal + | | | + | V | + \------> quit <------/ + */ + enum state_t { + //! *this is initialized + st_init, + //! *this has associated thread that is starting up. + st_starting, + //! Associated thread is doing normal life sequence. + st_normal, + //! Associated thread has ended normal life sequence and promises to never touch *this again. + st_quit + }; + std::atomic my_state; + + //! Associated server + private_server& my_server; + + //! Associated client + tbb_client& my_client; + + //! index used for avoiding the 64K aliasing problem + const std::size_t my_index; + + //! Monitor for sleeping when there is no work to do. + /** The invariant that holds for sleeping workers is: + "my_slack<=0 && my_state==st_normal && I am on server's list of asleep threads" */ + thread_monitor my_thread_monitor; + + //! Handle of the OS thread associated with this worker + thread_handle my_handle; + + //! 
Link for list of workers that are sleeping or have no associated thread. + private_worker* my_next; + + friend class private_server; + + //! Actions executed by the associated thread + void run() noexcept; + + //! Wake up associated thread (or launch a thread if there is none) + void wake_or_launch(); + + //! Called by a thread (usually not the associated thread) to commence termination. + void start_shutdown(); + + static __RML_DECL_THREAD_ROUTINE thread_routine( void* arg ); + + static void release_handle(thread_handle my_handle, bool join); + +protected: + private_worker( private_server& server, tbb_client& client, const std::size_t i ) : + my_state(st_init), my_server(server), my_client(client), my_index(i), + my_handle(), my_next() + {} +}; + +static const std::size_t cache_line_size = tbb::detail::max_nfs_size; + +#if _MSC_VER && !defined(__INTEL_COMPILER) + // Suppress overzealous compiler warnings about uninstantiable class + #pragma warning(push) + #pragma warning(disable:4510 4610) +#endif +class padded_private_worker: public private_worker { + char pad[cache_line_size - sizeof(private_worker)%cache_line_size]; +public: + padded_private_worker( private_server& server, tbb_client& client, const std::size_t i ) + : private_worker(server,client,i) { suppress_unused_warning(pad); } +}; +#if _MSC_VER && !defined(__INTEL_COMPILER) + #pragma warning(pop) +#endif + +class private_server: public tbb_server, no_copy { +private: + tbb_client& my_client; + //! Maximum number of threads to be created. + /** Threads are created lazily, so maximum might not actually be reached. */ + const tbb_client::size_type my_n_thread; + + //! Stack size for each thread. */ + const std::size_t my_stack_size; + + //! Number of jobs that could use their associated thread minus number of active threads. + /** If negative, indicates oversubscription. + If positive, indicates that more threads should run. + Can be lowered asynchronously, but must be raised only while holding my_asleep_list_mutex, + because raising it impacts the invariant for sleeping threads. */ + std::atomic my_slack; + + //! Counter used to determine when to delete this. + std::atomic my_ref_count; + + padded_private_worker* my_thread_array; + + //! List of workers that are asleep or committed to sleeping until notified by another thread. + std::atomic my_asleep_list_root; + + //! Protects my_asleep_list_root + typedef mutex asleep_list_mutex_type; + asleep_list_mutex_type my_asleep_list_mutex; + +#if TBB_USE_ASSERT + std::atomic my_net_slack_requests; +#endif /* TBB_USE_ASSERT */ + + //! Wake up to two sleeping workers, if there are any sleeping. + /** The call is used to propagate a chain reaction where each thread wakes up two threads, + which in turn each wake up two threads, etc. */ + void propagate_chain_reaction() { + // First test of a double-check idiom. Second test is inside wake_some(0). + if( my_asleep_list_root.load(std::memory_order_relaxed) ) + wake_some(0); + } + + //! Try to add t to list of sleeping workers + bool try_insert_in_asleep_list( private_worker& t ); + + //! Equivalent of adding additional_slack to my_slack and waking up to 2 threads if my_slack permits. 
+ void wake_some( int additional_slack ); + + ~private_server() override; + + void remove_server_ref() { + if( --my_ref_count==0 ) { + my_client.acknowledge_close_connection(); + this->~private_server(); + tbb::cache_aligned_allocator().deallocate( this, 1 ); + } + } + + friend class private_worker; +public: + private_server( tbb_client& client ); + + version_type version() const override { + return 0; + } + + void request_close_connection( bool /*exiting*/ ) override { + for( std::size_t i=0; i=2 && !__MINGW64__ +// ensure that stack is properly aligned for TBB threads +__attribute__((force_align_arg_pointer)) +#endif +__RML_DECL_THREAD_ROUTINE private_worker::thread_routine( void* arg ) { + private_worker* self = static_cast(arg); + AVOID_64K_ALIASING( self->my_index ); + self->run(); + // return 0 instead of nullptr due to the difference in the type __RML_DECL_THREAD_ROUTINE on various OSs + return 0; +} +#if _MSC_VER && !defined(__INTEL_COMPILER) + #pragma warning(pop) +#endif + +void private_worker::release_handle(thread_handle handle, bool join) { + if (join) + thread_monitor::join(handle); + else + thread_monitor::detach_thread(handle); +} + +void private_worker::start_shutdown() { + __TBB_ASSERT(my_state.load(std::memory_order_relaxed) != st_quit, "The quit state is expected to be set only once"); + + // `acq` to acquire my_handle + // `rel` to release market state + state_t prev_state = my_state.exchange(st_quit, std::memory_order_acq_rel); + + if (prev_state == st_init) { + // Perform action that otherwise would be performed by associated thread when it quits. + my_server.remove_server_ref(); + } else { + __TBB_ASSERT(prev_state == st_normal || prev_state == st_starting, nullptr); + // May have invalidated invariant for sleeping, so wake up the thread. + // Note that the notify() here occurs without maintaining invariants for my_slack. + // It does not matter, because my_state==st_quit overrides checking of my_slack. + my_thread_monitor.notify(); + // Do not need release handle in st_init state, + // because in this case the thread wasn't started yet. + // For st_starting release is done at launch site. + if (prev_state == st_normal) + release_handle(my_handle, governor::does_client_join_workers(my_client)); + } +} + +void private_worker::run() noexcept { + my_server.propagate_chain_reaction(); + + // Transiting to st_normal here would require setting my_handle, + // which would create race with the launching thread and + // complications in handle management on Windows. + + ::rml::job& j = *my_client.create_one_job(); + // memory_order_seq_cst to be strictly ordered after thread_monitor::wait on the next iteration + while( my_state.load(std::memory_order_seq_cst)!=st_quit ) { + if( my_server.my_slack.load(std::memory_order_acquire)>=0 ) { + my_client.process(j); + } else if( my_server.try_insert_in_asleep_list(*this) ) { + my_thread_monitor.wait(); + __TBB_ASSERT(my_state.load(std::memory_order_relaxed) == st_quit || !my_next, "Thread monitor missed a spurious wakeup?" 
); + my_server.propagate_chain_reaction(); + } + } + my_client.cleanup(j); + + ++my_server.my_slack; + my_server.remove_server_ref(); +} + +inline void private_worker::wake_or_launch() { + state_t state = my_state.load(std::memory_order_relaxed); + + switch (state) { + case st_starting: + __TBB_fallthrough; + case st_normal: + __TBB_ASSERT(!my_next, "Should not wake a thread while it's still in asleep list"); + my_thread_monitor.notify(); + break; + case st_init: + if (my_state.compare_exchange_strong(state, st_starting)) { + // after this point, remove_server_ref() must be done by created thread +#if __TBB_USE_WINAPI + // Win thread_monitor::launch is designed on the assumption that the workers thread id go from 1 to Hard limit set by TBB market::global_market + const std::size_t worker_idx = my_server.my_n_thread - this->my_index; + my_handle = thread_monitor::launch(thread_routine, this, my_server.my_stack_size, &worker_idx); +#elif __TBB_USE_POSIX + { + affinity_helper fpa; + fpa.protect_affinity_mask( /*restore_process_mask=*/true); + my_handle = thread_monitor::launch(thread_routine, this, my_server.my_stack_size); + // Implicit destruction of fpa resets original affinity mask. + } +#endif /* __TBB_USE_POSIX */ + state = st_starting; + if (!my_state.compare_exchange_strong(state, st_normal)) { + // Do shutdown during startup. my_handle can't be released + // by start_shutdown, because my_handle value might be not set yet + // at time of transition from st_starting to st_quit. + __TBB_ASSERT(state == st_quit, nullptr); + release_handle(my_handle, governor::does_client_join_workers(my_client)); + } + } + break; + default: + __TBB_ASSERT(state == st_quit, nullptr); + } +} + +//------------------------------------------------------------------------ +// Methods of private_server +//------------------------------------------------------------------------ +private_server::private_server( tbb_client& client ) : + my_client(client), + my_n_thread(client.max_job_count()), + my_stack_size(client.min_stack_size()), + my_slack(0), + my_ref_count(my_n_thread+1), + my_thread_array(nullptr), + my_asleep_list_root(nullptr) +#if TBB_USE_ASSERT + , my_net_slack_requests(0) +#endif /* TBB_USE_ASSERT */ +{ + my_thread_array = tbb::cache_aligned_allocator().allocate( my_n_thread ); + for( std::size_t i=0; imy_next = my_asleep_list_root.load(std::memory_order_relaxed); + my_asleep_list_root.store(t, std::memory_order_relaxed); + } +} + +private_server::~private_server() { + __TBB_ASSERT( my_net_slack_requests==0, nullptr); + for( std::size_t i=my_n_thread; i--; ) + my_thread_array[i].~padded_private_worker(); + tbb::cache_aligned_allocator().deallocate( my_thread_array, my_n_thread ); + tbb::detail::poison_pointer( my_thread_array ); +} + +inline bool private_server::try_insert_in_asleep_list( private_worker& t ) { + asleep_list_mutex_type::scoped_lock lock; + if( !lock.try_acquire(my_asleep_list_mutex) ) + return false; + // Contribute to slack under lock so that if another takes that unit of slack, + // it sees us sleeping on the list and wakes us up. 
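    // Reading of the loop that follows: my_slack < 0 means there are more active
    // workers than available work, so this worker may park itself. While that holds,
    // it raises my_slack by one with a CAS (it is about to stop being active), links
    // itself onto the asleep list under the lock, and returns true so run() will block
    // on its monitor. If my_slack is already >= 0, the loop is skipped and false is
    // returned, keeping the worker active.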
+ auto expected = my_slack.load(std::memory_order_relaxed); + while (expected < 0) { + if (my_slack.compare_exchange_strong(expected, expected + 1)) { + t.my_next = my_asleep_list_root.load(std::memory_order_relaxed); + my_asleep_list_root.store(&t, std::memory_order_relaxed); + return true; + } + } + + return false; +} + +void private_server::wake_some( int additional_slack ) { + __TBB_ASSERT( additional_slack>=0, nullptr ); + private_worker* wakee[2]; + private_worker**w = wakee; + + if (additional_slack) { + // Contribute our unused slack to my_slack. + my_slack += additional_slack; + } + + int allotted_slack = 0; + while (allotted_slack < 2) { + // Chain reaction; Try to claim unit of slack + int old = my_slack.load(std::memory_order_relaxed); + do { + if (old <= 0) goto done; + } while (!my_slack.compare_exchange_strong(old, old - 1)); + ++allotted_slack; + } +done: + + if (allotted_slack) { + asleep_list_mutex_type::scoped_lock lock(my_asleep_list_mutex); + auto root = my_asleep_list_root.load(std::memory_order_relaxed); + while( root && wmy_next; + } + my_asleep_list_root.store(root, std::memory_order_relaxed); + if(allotted_slack) { + // Contribute our unused slack to my_slack. + my_slack += allotted_slack; + } + } + while( w>wakee ) { + private_worker* ww = *--w; + ww->my_next = nullptr; + ww->wake_or_launch(); + } +} + +void private_server::adjust_job_count_estimate( int delta ) { +#if TBB_USE_ASSERT + my_net_slack_requests+=delta; +#endif /* TBB_USE_ASSERT */ + if( delta<0 ) { + my_slack+=delta; + } else if( delta>0 ) { + wake_some( delta ); + } +} + +//! Factory method called from task.cpp to create a private_server. +tbb_server* make_private_server( tbb_client& client ) { + return new( tbb::cache_aligned_allocator().allocate(1) ) private_server(client); +} + +} // namespace rml +} // namespace r1 +} // namespace detail +} // namespace tbb + diff --git a/third_party/tbb/profiling.cpp b/third_party/tbb/profiling.cpp new file mode 100644 index 000000000..21ed67b53 --- /dev/null +++ b/third_party/tbb/profiling.cpp @@ -0,0 +1,268 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
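The chain-reaction wakeup in private_server.cpp above hinges on claiming units of my_slack one at a time; here is a standalone sketch of that claim idiom (hypothetical helper name, none of the RML bookkeeping).

#include "third_party/libcxx/atomic"

// Succeeds only while the counter is positive, atomically taking one unit;
// mirrors the "Try to claim unit of slack" loop inside wake_some().
inline bool try_claim_one(std::atomic<int>& slack) {
    int old = slack.load(std::memory_order_relaxed);
    do {
        if (old <= 0)
            return false;              // nothing to claim
    } while (!slack.compare_exchange_strong(old, old - 1));
    return true;                       // caller may now wake one sleeping worker
}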
+*/ + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_template_helpers.h" + +#include "third_party/tbb/main.h" +#include "third_party/tbb/itt_notify.h" + +#include "third_party/tbb/profiling.h" + +#include "libc/mem/alg.h" +#include "libc/mem/mem.h" +#include "libc/str/str.h" + +namespace tbb { +namespace detail { +namespace r1 { + +#if __TBB_USE_ITT_NOTIFY +bool ITT_Present; +static std::atomic ITT_InitializationDone; + +static __itt_domain *tbb_domains[d1::ITT_NUM_DOMAINS] = {}; + +struct resource_string { + const char *str; + __itt_string_handle *itt_str_handle; +}; + +// +// populate resource strings +// +#define TBB_STRING_RESOURCE( index_name, str ) { str, nullptr }, +static resource_string strings_for_itt[] = { + #include "third_party/tbb/detail/_string_resource.h" + { "num_resource_strings", nullptr } +}; +#undef TBB_STRING_RESOURCE + +static __itt_string_handle* ITT_get_string_handle(std::uintptr_t idx) { + __TBB_ASSERT(idx < NUM_STRINGS, "string handle out of valid range"); + return idx < NUM_STRINGS ? strings_for_itt[idx].itt_str_handle : nullptr; +} + +static void ITT_init_domains() { + tbb_domains[d1::ITT_DOMAIN_MAIN] = __itt_domain_create( _T("tbb") ); + tbb_domains[d1::ITT_DOMAIN_MAIN]->flags = 1; + tbb_domains[d1::ITT_DOMAIN_FLOW] = __itt_domain_create( _T("tbb.flow") ); + tbb_domains[d1::ITT_DOMAIN_FLOW]->flags = 1; + tbb_domains[d1::ITT_DOMAIN_ALGO] = __itt_domain_create( _T("tbb.algorithm") ); + tbb_domains[d1::ITT_DOMAIN_ALGO]->flags = 1; +} + +static void ITT_init_strings() { + for ( std::uintptr_t i = 0; i < NUM_STRINGS; ++i ) { +#if _WIN32||_WIN64 + strings_for_itt[i].itt_str_handle = __itt_string_handle_createA( strings_for_itt[i].str ); +#else + strings_for_itt[i].itt_str_handle = __itt_string_handle_create( strings_for_itt[i].str ); +#endif + } +} + +static void ITT_init() { + ITT_init_domains(); + ITT_init_strings(); +} + +/** Thread-unsafe lazy one-time initialization of tools interop. + Used by both dummy handlers and general TBB one-time initialization routine. **/ +void ITT_DoUnsafeOneTimeInitialization () { + // Double check ITT_InitializationDone is necessary because the first check + // in ITT_DoOneTimeInitialization is not guarded with the __TBB_InitOnce lock. + if ( !ITT_InitializationDone ) { + ITT_Present = (__TBB_load_ittnotify()!=0); + if (ITT_Present) ITT_init(); + ITT_InitializationDone = true; + } +} + +/** Thread-safe lazy one-time initialization of tools interop. + Used by dummy handlers only. 
**/ +extern "C" +void ITT_DoOneTimeInitialization() { + if ( !ITT_InitializationDone ) { + __TBB_InitOnce::lock(); + ITT_DoUnsafeOneTimeInitialization(); + __TBB_InitOnce::unlock(); + } +} + +void create_itt_sync(void* ptr, const tchar* objtype, const tchar* objname) { + ITT_SYNC_CREATE(ptr, objtype, objname); +} + +void call_itt_notify(int t, void *ptr) { + switch (t) { + case 0: ITT_NOTIFY(sync_prepare, ptr); break; + case 1: ITT_NOTIFY(sync_cancel, ptr); break; + case 2: ITT_NOTIFY(sync_acquired, ptr); break; + case 3: ITT_NOTIFY(sync_releasing, ptr); break; + case 4: ITT_NOTIFY(sync_destroy, ptr); break; + } +} + +void itt_set_sync_name(void* obj, const tchar* name) { + __itt_sync_rename(obj, name); +} + +const __itt_id itt_null_id = { 0, 0, 0 }; + +static inline __itt_domain* get_itt_domain(d1::itt_domain_enum idx) { + if (tbb_domains[idx] == nullptr) { + ITT_DoOneTimeInitialization(); + } + return tbb_domains[idx]; +} + +static inline void itt_id_make(__itt_id* id, void* addr, unsigned long long extra) { + *id = __itt_id_make(addr, extra); +} + +static inline void itt_id_create(const __itt_domain* domain, __itt_id id) { + __itt_id_create(domain, id); +} + +void itt_make_task_group(d1::itt_domain_enum domain, void* group, unsigned long long group_extra, + void* parent, unsigned long long parent_extra, string_resource_index name_index) { + if (__itt_domain* d = get_itt_domain(domain)) { + __itt_id group_id = itt_null_id; + __itt_id parent_id = itt_null_id; + itt_id_make(&group_id, group, group_extra); + itt_id_create(d, group_id); + if (parent) { + itt_id_make(&parent_id, parent, parent_extra); + } + __itt_string_handle* n = ITT_get_string_handle(name_index); + __itt_task_group(d, group_id, parent_id, n); + } +} + +void __TBB_EXPORTED_FUNC itt_metadata_str_add(d1::itt_domain_enum domain, void *addr, unsigned long long addr_extra, + string_resource_index key, const char *value ) { + if ( __itt_domain *d = get_itt_domain( domain ) ) { + __itt_id id = itt_null_id; + itt_id_make( &id, addr, addr_extra ); + __itt_string_handle *k = ITT_get_string_handle(key); + size_t value_length = strlen( value ); +#if _WIN32||_WIN64 + __itt_metadata_str_addA(d, id, k, value, value_length); +#else + __itt_metadata_str_add(d, id, k, value, value_length); +#endif + } +} + +void __TBB_EXPORTED_FUNC itt_metadata_ptr_add(d1::itt_domain_enum domain, void *addr, unsigned long long addr_extra, + string_resource_index key, void *value ) { + if ( __itt_domain *d = get_itt_domain( domain ) ) { + __itt_id id = itt_null_id; + itt_id_make( &id, addr, addr_extra ); + __itt_string_handle *k = ITT_get_string_handle(key); +#if __TBB_x86_32 + __itt_metadata_add(d, id, k, __itt_metadata_u32, 1, value); +#else + __itt_metadata_add(d, id, k, __itt_metadata_u64, 1, value); +#endif + } +} + +void __TBB_EXPORTED_FUNC itt_relation_add(d1::itt_domain_enum domain, void *addr0, unsigned long long addr0_extra, + itt_relation relation, void *addr1, unsigned long long addr1_extra ) { + if ( __itt_domain *d = get_itt_domain( domain ) ) { + __itt_id id0 = itt_null_id; + __itt_id id1 = itt_null_id; + itt_id_make( &id0, addr0, addr0_extra ); + itt_id_make( &id1, addr1, addr1_extra ); + __itt_relation_add( d, id0, (__itt_relation)relation, id1 ); + } +} + +void __TBB_EXPORTED_FUNC itt_task_begin(d1::itt_domain_enum domain, void* task, unsigned long long task_extra, + void* parent, unsigned long long parent_extra, string_resource_index name_index) { + if (__itt_domain* d = get_itt_domain(domain)) { + __itt_id task_id = itt_null_id; + __itt_id 
parent_id = itt_null_id; + if (task) { + itt_id_make(&task_id, task, task_extra); + } + if (parent) { + itt_id_make(&parent_id, parent, parent_extra); + } + __itt_string_handle* n = ITT_get_string_handle(name_index); + __itt_task_begin(d, task_id, parent_id, n); + } +} + +void __TBB_EXPORTED_FUNC itt_task_end(d1::itt_domain_enum domain) { + if (__itt_domain* d = get_itt_domain(domain)) { + __itt_task_end(d); + } +} + +void __TBB_EXPORTED_FUNC itt_region_begin(d1::itt_domain_enum domain, void *region, unsigned long long region_extra, + void *parent, unsigned long long parent_extra, string_resource_index /* name_index */ ) { + if ( __itt_domain *d = get_itt_domain( domain ) ) { + __itt_id region_id = itt_null_id; + __itt_id parent_id = itt_null_id; + itt_id_make( ®ion_id, region, region_extra ); + if ( parent ) { + itt_id_make( &parent_id, parent, parent_extra ); + } + __itt_region_begin( d, region_id, parent_id, nullptr ); + } +} + +void __TBB_EXPORTED_FUNC itt_region_end(d1::itt_domain_enum domain, void *region, unsigned long long region_extra ) { + if ( __itt_domain *d = get_itt_domain( domain ) ) { + __itt_id region_id = itt_null_id; + itt_id_make( ®ion_id, region, region_extra ); + __itt_region_end( d, region_id ); + } +} + +#else +void create_itt_sync(void* /*ptr*/, const tchar* /*objtype*/, const tchar* /*objname*/) {} +void call_itt_notify(int /*t*/, void* /*ptr*/) {} +void itt_set_sync_name(void* /*obj*/, const tchar* /*name*/) {} +void itt_make_task_group(d1::itt_domain_enum /*domain*/, void* /*group*/, unsigned long long /*group_extra*/, + void* /*parent*/, unsigned long long /*parent_extra*/, string_resource_index /*name_index*/) {} +void itt_metadata_str_add(d1::itt_domain_enum /*domain*/, void* /*addr*/, unsigned long long /*addr_extra*/, + string_resource_index /*key*/, const char* /*value*/ ) { } +void itt_metadata_ptr_add(d1::itt_domain_enum /*domain*/, void * /*addr*/, unsigned long long /*addr_extra*/, + string_resource_index /*key*/, void * /*value*/ ) {} +void itt_relation_add(d1::itt_domain_enum /*domain*/, void* /*addr0*/, unsigned long long /*addr0_extra*/, + itt_relation /*relation*/, void* /*addr1*/, unsigned long long /*addr1_extra*/ ) { } +void itt_task_begin(d1::itt_domain_enum /*domain*/, void* /*task*/, unsigned long long /*task_extra*/, + void* /*parent*/, unsigned long long /*parent_extra*/, string_resource_index /*name_index*/ ) { } +void itt_task_end(d1::itt_domain_enum /*domain*/ ) { } +void itt_region_begin(d1::itt_domain_enum /*domain*/, void* /*region*/, unsigned long long /*region_extra*/, + void* /*parent*/, unsigned long long /*parent_extra*/, string_resource_index /*name_index*/ ) { } +void itt_region_end(d1::itt_domain_enum /*domain*/, void* /*region*/, unsigned long long /*region_extra*/ ) { } +#endif /* __TBB_USE_ITT_NOTIFY */ + +const tchar + *SyncType_Scheduler = _T("%Constant") + ; +const tchar + *SyncObj_ContextsList = _T("TBB Scheduler") + ; +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/profiling.h b/third_party/tbb/profiling.h new file mode 100644 index 000000000..707df8ce4 --- /dev/null +++ b/third_party/tbb/profiling.h @@ -0,0 +1,259 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
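profiling.cpp above and profiling.h below expand the same TBB_STRING_RESOURCE list (detail/_string_resource.h) twice: once into the strings_for_itt[] table and once into the string_resource_index enum. A minimal standalone illustration of that X-macro idea, using a hypothetical color list and a list-macro instead of TBB's re-included header:

// One entry per item; each use site supplies its own expansion of X.
#define MY_COLOR_LIST(X) \
    X(COLOR_RED,   "red")   \
    X(COLOR_GREEN, "green") \
    X(COLOR_BLUE,  "blue")

// Expansion 1: an enum of indices (cf. string_resource_index).
#define AS_ENUM(name, str) name,
enum color_index { MY_COLOR_LIST(AS_ENUM) NUM_COLORS };
#undef AS_ENUM

// Expansion 2: a parallel string table indexed by the enum (cf. strings_for_itt[]).
#define AS_STRING(name, str) str,
static const char* color_names[] = { MY_COLOR_LIST(AS_STRING) "num_colors" };
#undef AS_STRING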
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_profiling_H +#define __TBB_profiling_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/libcxx/cstdint" + +#include "third_party/libcxx/string" + +namespace tbb { +namespace detail { +inline namespace d0 { + // include list of index names + #define TBB_STRING_RESOURCE(index_name,str) index_name, + enum string_resource_index : std::uintptr_t { + #include "third_party/tbb/detail/_string_resource.h" + NUM_STRINGS + }; + #undef TBB_STRING_RESOURCE + + enum itt_relation + { + __itt_relation_is_unknown = 0, + __itt_relation_is_dependent_on, /**< "A is dependent on B" means that A cannot start until B completes */ + __itt_relation_is_sibling_of, /**< "A is sibling of B" means that A and B were created as a group */ + __itt_relation_is_parent_of, /**< "A is parent of B" means that A created B */ + __itt_relation_is_continuation_of, /**< "A is continuation of B" means that A assumes the dependencies of B */ + __itt_relation_is_child_of, /**< "A is child of B" means that A was created by B (inverse of is_parent_of) */ + __itt_relation_is_continued_by, /**< "A is continued by B" means that B assumes the dependencies of A (inverse of is_continuation_of) */ + __itt_relation_is_predecessor_to /**< "A is predecessor to B" means that B cannot start until A completes (inverse of is_dependent_on) */ + }; + +//! Unicode support +#if (_WIN32||_WIN64) + //! Unicode character type. Always wchar_t on Windows. 
+ using tchar = wchar_t; +#else /* !WIN */ + using tchar = char; +#endif /* !WIN */ + +} // namespace d0 +} // namespace detail +} // namespace tbb + +#include "third_party/libcxx/atomic" +#if _WIN32||_WIN64 +#include "libc/calls/calls.h" +#include "libc/calls/termios.h" +#include "libc/fmt/conv.h" +#include "libc/limits.h" +#include "libc/mem/alg.h" +#include "libc/mem/alloca.h" +#include "libc/mem/mem.h" +#include "libc/runtime/runtime.h" +#include "libc/stdio/dprintf.h" +#include "libc/stdio/rand.h" +#include "libc/stdio/temp.h" +#include "libc/str/str.h" +#include "libc/sysv/consts/exit.h" +#include "third_party/getopt/getopt.h" +#include "third_party/musl/crypt.h" +#include "third_party/musl/rand48.h" /* mbstowcs_s */ +#endif +// Need these to work regardless of tools support +namespace tbb { +namespace detail { +namespace d1 { + enum notify_type {prepare=0, cancel, acquired, releasing, destroy}; + enum itt_domain_enum { ITT_DOMAIN_FLOW=0, ITT_DOMAIN_MAIN=1, ITT_DOMAIN_ALGO=2, ITT_NUM_DOMAINS }; +} // namespace d1 + +namespace r1 { + TBB_EXPORT void __TBB_EXPORTED_FUNC call_itt_notify(int t, void* ptr); + TBB_EXPORT void __TBB_EXPORTED_FUNC create_itt_sync(void* ptr, const tchar* objtype, const tchar* objname); + TBB_EXPORT void __TBB_EXPORTED_FUNC itt_make_task_group(d1::itt_domain_enum domain, void* group, unsigned long long group_extra, + void* parent, unsigned long long parent_extra, string_resource_index name_index); + TBB_EXPORT void __TBB_EXPORTED_FUNC itt_task_begin(d1::itt_domain_enum domain, void* task, unsigned long long task_extra, + void* parent, unsigned long long parent_extra, string_resource_index name_index); + TBB_EXPORT void __TBB_EXPORTED_FUNC itt_task_end(d1::itt_domain_enum domain); + TBB_EXPORT void __TBB_EXPORTED_FUNC itt_set_sync_name(void* obj, const tchar* name); + TBB_EXPORT void __TBB_EXPORTED_FUNC itt_metadata_str_add(d1::itt_domain_enum domain, void* addr, unsigned long long addr_extra, + string_resource_index key, const char* value); + TBB_EXPORT void __TBB_EXPORTED_FUNC itt_metadata_ptr_add(d1::itt_domain_enum domain, void* addr, unsigned long long addr_extra, + string_resource_index key, void* value); + TBB_EXPORT void __TBB_EXPORTED_FUNC itt_relation_add(d1::itt_domain_enum domain, void* addr0, unsigned long long addr0_extra, + itt_relation relation, void* addr1, unsigned long long addr1_extra); + TBB_EXPORT void __TBB_EXPORTED_FUNC itt_region_begin(d1::itt_domain_enum domain, void* region, unsigned long long region_extra, + void* parent, unsigned long long parent_extra, string_resource_index /* name_index */); + TBB_EXPORT void __TBB_EXPORTED_FUNC itt_region_end(d1::itt_domain_enum domain, void* region, unsigned long long region_extra); +} // namespace r1 + +namespace d1 { +#if TBB_USE_PROFILING_TOOLS && (_WIN32||_WIN64) + inline std::size_t multibyte_to_widechar(wchar_t* wcs, const char* mbs, std::size_t bufsize) { + std::size_t len; + mbstowcs_s(&len, wcs, bufsize, mbs, _TRUNCATE); + return len; // mbstowcs_s counts null terminator + } +#endif + +#if TBB_USE_PROFILING_TOOLS + inline void create_itt_sync(void *ptr, const char *objtype, const char *objname) { +#if (_WIN32||_WIN64) + std::size_t len_type = multibyte_to_widechar(nullptr, objtype, 0); + wchar_t *type = new wchar_t[len_type]; + multibyte_to_widechar(type, objtype, len_type); + std::size_t len_name = multibyte_to_widechar(nullptr, objname, 0); + wchar_t *name = new wchar_t[len_name]; + multibyte_to_widechar(name, objname, len_name); +#else // WIN + const char *type = objtype; + const 
char *name = objname; +#endif + r1::create_itt_sync(ptr, type, name); + +#if (_WIN32||_WIN64) + delete[] type; + delete[] name; +#endif // WIN + } + +// Distinguish notifications on task for reducing overheads +#if TBB_USE_PROFILING_TOOLS == 2 + inline void call_itt_task_notify(d1::notify_type t, void *ptr) { + r1::call_itt_notify(static_cast(t), ptr); + } +#else + inline void call_itt_task_notify(d1::notify_type, void *) {} +#endif // TBB_USE_PROFILING_TOOLS + + inline void call_itt_notify(d1::notify_type t, void *ptr) { + r1::call_itt_notify(static_cast(t), ptr); + } + +#if (_WIN32||_WIN64) && !__MINGW32__ + inline void itt_set_sync_name(void* obj, const wchar_t* name) { + r1::itt_set_sync_name(obj, name); + } + inline void itt_set_sync_name(void* obj, const char* name) { + std::size_t len_name = multibyte_to_widechar(nullptr, name, 0); + wchar_t *obj_name = new wchar_t[len_name]; + multibyte_to_widechar(obj_name, name, len_name); + r1::itt_set_sync_name(obj, obj_name); + delete[] obj_name; + } +#else + inline void itt_set_sync_name( void* obj, const char* name) { + r1::itt_set_sync_name(obj, name); + } +#endif //WIN + + inline void itt_make_task_group(itt_domain_enum domain, void* group, unsigned long long group_extra, + void* parent, unsigned long long parent_extra, string_resource_index name_index) { + r1::itt_make_task_group(domain, group, group_extra, parent, parent_extra, name_index); + } + + inline void itt_metadata_str_add( itt_domain_enum domain, void *addr, unsigned long long addr_extra, + string_resource_index key, const char *value ) { + r1::itt_metadata_str_add( domain, addr, addr_extra, key, value ); + } + + inline void register_node_addr(itt_domain_enum domain, void *addr, unsigned long long addr_extra, + string_resource_index key, void *value) { + r1::itt_metadata_ptr_add(domain, addr, addr_extra, key, value); + } + + inline void itt_relation_add( itt_domain_enum domain, void *addr0, unsigned long long addr0_extra, + itt_relation relation, void *addr1, unsigned long long addr1_extra ) { + r1::itt_relation_add( domain, addr0, addr0_extra, relation, addr1, addr1_extra ); + } + + inline void itt_task_begin( itt_domain_enum domain, void *task, unsigned long long task_extra, + void *parent, unsigned long long parent_extra, string_resource_index name_index ) { + r1::itt_task_begin( domain, task, task_extra, parent, parent_extra, name_index ); + } + + inline void itt_task_end( itt_domain_enum domain ) { + r1::itt_task_end( domain ); + } + + inline void itt_region_begin( itt_domain_enum domain, void *region, unsigned long long region_extra, + void *parent, unsigned long long parent_extra, string_resource_index name_index ) { + r1::itt_region_begin( domain, region, region_extra, parent, parent_extra, name_index ); + } + + inline void itt_region_end( itt_domain_enum domain, void *region, unsigned long long region_extra ) { + r1::itt_region_end( domain, region, region_extra ); + } +#else + inline void create_itt_sync(void* /*ptr*/, const char* /*objtype*/, const char* /*objname*/) {} + + inline void call_itt_notify(notify_type /*t*/, void* /*ptr*/) {} + + inline void call_itt_task_notify(notify_type /*t*/, void* /*ptr*/) {} +#endif // TBB_USE_PROFILING_TOOLS + +#if TBB_USE_PROFILING_TOOLS && !(TBB_USE_PROFILING_TOOLS == 2) +class event { +/** This class supports user event traces through itt. + Common use-case is tagging data flow graph tasks (data-id) + and visualization by Intel Advisor Flow Graph Analyzer (FGA) **/ +// TODO: Replace implementation by itt user event api. 
+ + const std::string my_name; + + static void emit_trace(const std::string &input) { + itt_metadata_str_add( ITT_DOMAIN_FLOW, nullptr, FLOW_NULL, USER_EVENT, ( "FGA::DATAID::" + input ).c_str() ); + } + +public: + event(const std::string &input) + : my_name( input ) + { } + + void emit() { + emit_trace(my_name); + } + + static void emit(const std::string &description) { + emit_trace(description); + } + +}; +#else // TBB_USE_PROFILING_TOOLS && !(TBB_USE_PROFILING_TOOLS == 2) +// Using empty struct if user event tracing is disabled: +struct event { + event(const std::string &) { } + + void emit() { } + + static void emit(const std::string &) { } +}; +#endif // TBB_USE_PROFILING_TOOLS && !(TBB_USE_PROFILING_TOOLS == 2) +} // namespace d1 +} // namespace detail + +namespace profiling { + using detail::d1::event; +} +} // namespace tbb + + +#endif /* __TBB_profiling_H */ diff --git a/third_party/tbb/queuing_mutex.h b/third_party/tbb/queuing_mutex.h new file mode 100644 index 000000000..0636b667e --- /dev/null +++ b/third_party/tbb/queuing_mutex.h @@ -0,0 +1,193 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_queuing_mutex_H +#define __TBB_queuing_mutex_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_mutex_common.h" + +#include "third_party/tbb/profiling.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace d1 { + +//! Queuing mutex with local-only spinning. +/** @ingroup synchronization */ +class queuing_mutex { +public: + //! Construct unacquired mutex. + queuing_mutex() noexcept { + create_itt_sync(this, "tbb::queuing_mutex", ""); + }; + + queuing_mutex(const queuing_mutex&) = delete; + queuing_mutex& operator=(const queuing_mutex&) = delete; + + //! The scoped locking pattern + /** It helps to avoid the common problem of forgetting to release lock. + It also nicely provides the "node" for queuing locks. */ + class scoped_lock { + //! Reset fields to mean "no lock held". + void reset() { + m_mutex = nullptr; + } + + public: + //! Construct lock that has not acquired a mutex. + /** Equivalent to zero-initialization of *this. */ + scoped_lock() = default; + + //! Acquire lock on given mutex. + scoped_lock(queuing_mutex& m) { + acquire(m); + } + + //! Release lock (if lock is held). + ~scoped_lock() { + if (m_mutex) release(); + } + + //! No Copy + scoped_lock( const scoped_lock& ) = delete; + scoped_lock& operator=( const scoped_lock& ) = delete; + + //! Acquire lock on given mutex. + void acquire( queuing_mutex& m ) { + __TBB_ASSERT(!m_mutex, "scoped_lock is already holding a mutex"); + + // Must set all fields before the exchange, because once the + // exchange executes, *this becomes accessible to other threads. 
+ m_mutex = &m; + m_next.store(nullptr, std::memory_order_relaxed); + m_going.store(0U, std::memory_order_relaxed); + + // x86 compare exchange operation always has a strong fence + // "sending" the fields initialized above to other processors. + scoped_lock* pred = m.q_tail.exchange(this); + if (pred) { + call_itt_notify(prepare, &m); + __TBB_ASSERT(pred->m_next.load(std::memory_order_relaxed) == nullptr, "the predecessor has another successor!"); + + pred->m_next.store(this, std::memory_order_release); + spin_wait_while_eq(m_going, 0U); + } + call_itt_notify(acquired, &m); + + } + + //! Acquire lock on given mutex if free (i.e. non-blocking) + bool try_acquire( queuing_mutex& m ) { + __TBB_ASSERT(!m_mutex, "scoped_lock is already holding a mutex"); + + // Must set all fields before the compare_exchange_strong, because once the + // compare_exchange_strong executes, *this becomes accessible to other threads. + m_next.store(nullptr, std::memory_order_relaxed); + m_going.store(0U, std::memory_order_relaxed); + + scoped_lock* expected = nullptr; + // The compare_exchange_strong must have release semantics, because we are + // "sending" the fields initialized above to other processors. + // x86 compare exchange operation always has a strong fence + if (!m.q_tail.compare_exchange_strong(expected, this, std::memory_order_acq_rel)) + return false; + + m_mutex = &m; + + call_itt_notify(acquired, &m); + return true; + } + + //! Release lock. + void release() + { + __TBB_ASSERT(this->m_mutex, "no lock acquired"); + + call_itt_notify(releasing, this->m_mutex); + + if (m_next.load(std::memory_order_relaxed) == nullptr) { + scoped_lock* expected = this; + if (m_mutex->q_tail.compare_exchange_strong(expected, nullptr)) { + // this was the only item in the queue, and the queue is now empty. + reset(); + return; + } + // Someone in the queue + spin_wait_while_eq(m_next, nullptr); + } + m_next.load(std::memory_order_acquire)->m_going.store(1U, std::memory_order_release); + + reset(); + } + + private: + //! The pointer to the mutex owned, or nullptr if not holding a mutex. + queuing_mutex* m_mutex{nullptr}; + + //! The pointer to the next competitor for a mutex + std::atomic m_next{nullptr}; + + //! The local spin-wait variable + /** Inverted (0 - blocked, 1 - acquired the mutex) for the sake of + zero-initialization. Defining it as an entire word instead of + a byte seems to help performance slightly. */ + std::atomic m_going{0U}; + }; + + // Mutex traits + static constexpr bool is_rw_mutex = false; + static constexpr bool is_recursive_mutex = false; + static constexpr bool is_fair_mutex = true; + +private: + //! 
The last competitor requesting the lock + std::atomic q_tail{nullptr}; + +}; + +#if TBB_USE_PROFILING_TOOLS +inline void set_name(queuing_mutex& obj, const char* name) { + itt_set_sync_name(&obj, name); +} +#if (_WIN32||_WIN64) +inline void set_name(queuing_mutex& obj, const wchar_t* name) { + itt_set_sync_name(&obj, name); +} +#endif //WIN +#else +inline void set_name(queuing_mutex&, const char*) {} +#if (_WIN32||_WIN64) +inline void set_name(queuing_mutex&, const wchar_t*) {} +#endif //WIN +#endif +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::queuing_mutex; +} // namespace v1 +namespace profiling { + using detail::d1::set_name; +} +} // namespace tbb + +#endif /* __TBB_queuing_mutex_H */ diff --git a/third_party/tbb/queuing_rw_mutex.cpp b/third_party/tbb/queuing_rw_mutex.cpp new file mode 100644 index 000000000..675484a40 --- /dev/null +++ b/third_party/tbb/queuing_rw_mutex.cpp @@ -0,0 +1,618 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +/** Before making any changes in the implementation, please emulate algorithmic changes + with SPIN tool using /tools/spin_models/ReaderWriterMutex.pml. + There could be some code looking as "can be restructured" but its structure does matter! */ + +#include "third_party/tbb/queuing_rw_mutex.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/itt_notify.h" + +namespace tbb { +namespace detail { +namespace r1 { + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) + // Workaround for overzealous compiler warnings + #pragma warning (push) + #pragma warning (disable: 4311 4312) +#endif + +//! A view of a T* with additional functionality for twiddling low-order bits. 
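A usage sketch for the queuing_mutex defined above; the struct, counter, and name string are illustrative. Each scoped_lock doubles as the caller's queue node, so acquisition is FIFO and every waiter spins only on its own flag.

#include "third_party/tbb/queuing_mutex.h"

struct hit_counter {
    tbb::queuing_mutex mtx;
    long hits = 0;
    hit_counter() { tbb::profiling::set_name(mtx, "hit_counter::mtx"); } // label shown in ITT traces
    void bump() {
        tbb::queuing_mutex::scoped_lock lock(mtx);   // enqueue and spin on our own node
        ++hits;
    }                                                // released when lock leaves scope
    bool try_bump() {
        tbb::queuing_mutex::scoped_lock lock;
        if (!lock.try_acquire(mtx))
            return false;                            // contended: do not block
        ++hits;
        return true;
    }
};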
+template +class tricky_atomic_pointer { +public: + using word = uintptr_t; + + static T* fetch_add( std::atomic& location, word addend, std::memory_order memory_order ) { + return reinterpret_cast(location.fetch_add(addend, memory_order)); + } + + static T* exchange( std::atomic& location, T* value, std::memory_order memory_order ) { + return reinterpret_cast(location.exchange(reinterpret_cast(value), memory_order)); + } + + static T* compare_exchange_strong( std::atomic& obj, const T* expected, const T* desired, std::memory_order memory_order ) { + word expd = reinterpret_cast(expected); + obj.compare_exchange_strong(expd, reinterpret_cast(desired), memory_order); + return reinterpret_cast(expd); + } + + static void store( std::atomic& location, const T* value, std::memory_order memory_order ) { + location.store(reinterpret_cast(value), memory_order); + } + + static T* load( std::atomic& location, std::memory_order memory_order ) { + return reinterpret_cast(location.load(memory_order)); + } + + static void spin_wait_while_eq(const std::atomic& location, const T* value) { + tbb::detail::d0::spin_wait_while_eq(location, reinterpret_cast(value) ); + } + + T* & ref; + tricky_atomic_pointer( T*& original ) : ref(original) {}; + tricky_atomic_pointer(const tricky_atomic_pointer&) = delete; + tricky_atomic_pointer& operator=(const tricky_atomic_pointer&) = delete; + T* operator&( const word operand2 ) const { + return reinterpret_cast( reinterpret_cast(ref) & operand2 ); + } + T* operator|( const word operand2 ) const { + return reinterpret_cast( reinterpret_cast(ref) | operand2 ); + } +}; + +using tricky_pointer = tricky_atomic_pointer; + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) + // Workaround for overzealous compiler warnings + #pragma warning (pop) +#endif + +//! Flag bits in a state_t that specify information about a locking request. +enum state_t_flags : unsigned char { + STATE_NONE = 0, + STATE_WRITER = 1<<0, + STATE_READER = 1<<1, + STATE_READER_UNBLOCKNEXT = 1<<2, + STATE_ACTIVEREADER = 1<<3, + STATE_UPGRADE_REQUESTED = 1<<4, + STATE_UPGRADE_WAITING = 1<<5, + STATE_UPGRADE_LOSER = 1<<6, + STATE_COMBINED_WAITINGREADER = STATE_READER | STATE_READER_UNBLOCKNEXT, + STATE_COMBINED_READER = STATE_COMBINED_WAITINGREADER | STATE_ACTIVEREADER, + STATE_COMBINED_UPGRADING = STATE_UPGRADE_WAITING | STATE_UPGRADE_LOSER +}; + +static const unsigned char RELEASED = 0; +static const unsigned char ACQUIRED = 1; + +struct queuing_rw_mutex_impl { + //! Try to acquire the internal lock + /** Returns true if lock was successfully acquired. */ + static bool try_acquire_internal_lock(d1::queuing_rw_mutex::scoped_lock& s) + { + auto expected = RELEASED; + return s.my_internal_lock.compare_exchange_strong(expected, ACQUIRED); + } + + //! Acquire the internal lock + static void acquire_internal_lock(d1::queuing_rw_mutex::scoped_lock& s) + { + // Usually, we would use the test-test-and-set idiom here, with exponential backoff. + // But so far, experiments indicate there is no value in doing so here. + while( !try_acquire_internal_lock(s) ) { + machine_pause(1); + } + } + + //! Release the internal lock + static void release_internal_lock(d1::queuing_rw_mutex::scoped_lock& s) + { + s.my_internal_lock.store(RELEASED, std::memory_order_release); + } + + //! Wait for internal lock to be released + static void wait_for_release_of_internal_lock(d1::queuing_rw_mutex::scoped_lock& s) + { + spin_wait_until_eq(s.my_internal_lock, RELEASED); + } + + //! 
A helper function + static void unblock_or_wait_on_internal_lock(d1::queuing_rw_mutex::scoped_lock& s, uintptr_t flag ) { + if( flag ) { + wait_for_release_of_internal_lock(s); + } + else { + release_internal_lock(s); + } + } + + //! Mask for low order bit of a pointer. + static const tricky_pointer::word FLAG = 0x1; + + static uintptr_t get_flag( d1::queuing_rw_mutex::scoped_lock* ptr ) { + return reinterpret_cast(ptr) & FLAG; + } + + //------------------------------------------------------------------------ + // Methods of queuing_rw_mutex::scoped_lock + //------------------------------------------------------------------------ + + //! A method to acquire queuing_rw_mutex lock + static void acquire(d1::queuing_rw_mutex& m, d1::queuing_rw_mutex::scoped_lock& s, bool write) + { + __TBB_ASSERT( !s.my_mutex, "scoped_lock is already holding a mutex"); + + // Must set all fields before the exchange, because once the + // exchange executes, *this becomes accessible to other threads. + s.my_mutex = &m; + s.my_prev.store(0U, std::memory_order_relaxed); + s.my_next.store(0U, std::memory_order_relaxed); + s.my_going.store(0U, std::memory_order_relaxed); + s.my_state.store(d1::queuing_rw_mutex::scoped_lock::state_t(write ? STATE_WRITER : STATE_READER), std::memory_order_relaxed); + s.my_internal_lock.store(RELEASED, std::memory_order_relaxed); + + + // The CAS must have release semantics, because we are + // "sending" the fields initialized above to other actors. + // We need acquire semantics, because we are acquiring the predecessor (or mutex if no predecessor) + queuing_rw_mutex::scoped_lock* predecessor = m.q_tail.exchange(&s, std::memory_order_acq_rel); + + if( write ) { // Acquiring for write + + if( predecessor ) { + ITT_NOTIFY(sync_prepare, s.my_mutex); + predecessor = tricky_pointer(predecessor) & ~FLAG; + __TBB_ASSERT( !predecessor->my_next, "the predecessor has another successor!"); + tricky_pointer::store(predecessor->my_next, &s, std::memory_order_release); + // We are acquiring the mutex + spin_wait_until_eq(s.my_going, 1U, std::memory_order_acquire); + } + + } else { // Acquiring for read + #if __TBB_USE_ITT_NOTIFY + bool sync_prepare_done = false; + #endif + if( predecessor ) { + unsigned char pred_state{}; + __TBB_ASSERT( !s.my_prev.load(std::memory_order_relaxed), "the predecessor is already set" ); + if( tricky_pointer(predecessor) & FLAG ) { + /* this is only possible if predecessor is an upgrading reader and it signals us to wait */ + pred_state = STATE_UPGRADE_WAITING; + predecessor = tricky_pointer(predecessor) & ~FLAG; + } else { + // Load predecessor->my_state now, because once predecessor->my_next becomes + // non-null, we must assume that *predecessor might be destroyed. + pred_state = predecessor->my_state.load(std::memory_order_relaxed); + if (pred_state == STATE_READER) { + // Notify the previous reader to unblock us. + predecessor->my_state.compare_exchange_strong(pred_state, STATE_READER_UNBLOCKNEXT, std::memory_order_relaxed); + } + if (pred_state == STATE_ACTIVEREADER) { // either we initially read it or CAS failed + // Active reader means that the predecessor already acquired the mutex and cannot notify us. + // Therefore, we need to acquire the mutex ourselves by re-reading predecessor state. + (void)predecessor->my_state.load(std::memory_order_acquire); + } + } + tricky_pointer::store(s.my_prev, predecessor, std::memory_order_relaxed); + __TBB_ASSERT( !( tricky_pointer(predecessor) & FLAG ), "use of corrupted pointer!" 
); + __TBB_ASSERT( !predecessor->my_next.load(std::memory_order_relaxed), "the predecessor has another successor!"); + tricky_pointer::store(predecessor->my_next, &s, std::memory_order_release); + if( pred_state != STATE_ACTIVEREADER ) { + #if __TBB_USE_ITT_NOTIFY + sync_prepare_done = true; + ITT_NOTIFY(sync_prepare, s.my_mutex); + #endif + // We are acquiring the mutex + spin_wait_until_eq(s.my_going, 1U, std::memory_order_acquire); + } + } + + // The protected state must have been acquired here before it can be further released to any other reader(s): + unsigned char old_state = STATE_READER; + // When this reader is signaled by previous actor it acquires the mutex. + // We need to build happens-before relation with all other coming readers that will read our ACTIVEREADER + // without blocking on my_going. Therefore, we need to publish ACTIVEREADER with release semantics. + // On fail it is relaxed, because we will build happens-before on my_going. + s.my_state.compare_exchange_strong(old_state, STATE_ACTIVEREADER, std::memory_order_release, std::memory_order_relaxed); + if( old_state!=STATE_READER ) { +#if __TBB_USE_ITT_NOTIFY + if( !sync_prepare_done ) + ITT_NOTIFY(sync_prepare, s.my_mutex); +#endif + // Failed to become active reader -> need to unblock the next waiting reader first + __TBB_ASSERT( s.my_state.load(std::memory_order_relaxed)==STATE_READER_UNBLOCKNEXT, "unexpected state" ); + spin_wait_while_eq(s.my_next, 0U, std::memory_order_acquire); + /* my_state should be changed before unblocking the next otherwise it might finish + and another thread can get our old state and left blocked */ + s.my_state.store(STATE_ACTIVEREADER, std::memory_order_relaxed); + tricky_pointer::load(s.my_next, std::memory_order_relaxed)->my_going.store(1U, std::memory_order_release); + } + __TBB_ASSERT(s.my_state.load(std::memory_order_relaxed) == STATE_ACTIVEREADER, "unlocked reader is active reader"); + } + + ITT_NOTIFY(sync_acquired, s.my_mutex); + } + + //! A method to acquire queuing_rw_mutex if it is free + static bool try_acquire(d1::queuing_rw_mutex& m, d1::queuing_rw_mutex::scoped_lock& s, bool write) + { + __TBB_ASSERT( !s.my_mutex, "scoped_lock is already holding a mutex"); + + if( m.q_tail.load(std::memory_order_relaxed) ) + return false; // Someone already took the lock + + // Must set all fields before the exchange, because once the + // exchange executes, *this becomes accessible to other threads. + s.my_prev.store(0U, std::memory_order_relaxed); + s.my_next.store(0U, std::memory_order_relaxed); + s.my_going.store(0U, std::memory_order_relaxed); // TODO: remove dead assignment? + s.my_state.store(d1::queuing_rw_mutex::scoped_lock::state_t(write ? STATE_WRITER : STATE_ACTIVEREADER), std::memory_order_relaxed); + s.my_internal_lock.store(RELEASED, std::memory_order_relaxed); + + // The CAS must have release semantics, because we are + // "sending" the fields initialized above to other actors. + // We need acquire semantics, because we are acquiring the mutex + d1::queuing_rw_mutex::scoped_lock* expected = nullptr; + if (!m.q_tail.compare_exchange_strong(expected, &s, std::memory_order_acq_rel)) + return false; // Someone already took the lock + s.my_mutex = &m; + ITT_NOTIFY(sync_acquired, s.my_mutex); + return true; + } + + //! 
A method to release queuing_rw_mutex lock + static void release(d1::queuing_rw_mutex::scoped_lock& s) { + __TBB_ASSERT(s.my_mutex!=nullptr, "no lock acquired"); + + ITT_NOTIFY(sync_releasing, s.my_mutex); + + if( s.my_state.load(std::memory_order_relaxed) == STATE_WRITER ) { // Acquired for write + + // The logic below is the same as "writerUnlock", but elides + // "return" from the middle of the routine. + // In the statement below, acquire semantics of reading my_next is required + // so that following operations with fields of my_next are safe. + d1::queuing_rw_mutex::scoped_lock* next = tricky_pointer::load(s.my_next, std::memory_order_acquire); + if( !next ) { + d1::queuing_rw_mutex::scoped_lock* expected = &s; + // Release mutex on success otherwise wait for successor publication + if( s.my_mutex->q_tail.compare_exchange_strong(expected, nullptr, + std::memory_order_release, std::memory_order_relaxed) ) + { + // this was the only item in the queue, and the queue is now empty. + goto done; + } + spin_wait_while_eq(s.my_next, 0U, std::memory_order_relaxed); + next = tricky_pointer::load(s.my_next, std::memory_order_acquire); + } + next->my_going.store(2U, std::memory_order_relaxed); // protect next queue node from being destroyed too early + // If the next is STATE_UPGRADE_WAITING, it is expected to acquire all other released readers via release + // sequence in next->my_state. In that case, we need to preserve release sequence in next->my_state + // contributed by other reader. So, there are two approaches not to break the release sequence: + // 1. Use read-modify-write (exchange) operation to store with release the UPGRADE_LOSER state; + // 2. Acquire the release sequence and store the sequence and UPGRADE_LOSER state. + // The second approach seems better on x86 because it does not involve interlocked operations. + // Therefore, we read next->my_state with acquire while it is not required for else branch to get the + // release sequence. + if( next->my_state.load(std::memory_order_acquire)==STATE_UPGRADE_WAITING ) { + // the next waiting for upgrade means this writer was upgraded before. + acquire_internal_lock(s); + // Responsibility transition, the one who reads uncorrupted my_prev will do release. + // Guarantee that above store of 2 into next->my_going happens-before resetting of next->my_prev + d1::queuing_rw_mutex::scoped_lock* tmp = tricky_pointer::exchange(next->my_prev, nullptr, std::memory_order_release); + // Pass the release sequence that we acquired with the above load of next->my_state. + next->my_state.store(STATE_UPGRADE_LOSER, std::memory_order_release); + // We are releasing the mutex + next->my_going.store(1U, std::memory_order_release); + unblock_or_wait_on_internal_lock(s, get_flag(tmp)); + } else { + // next->state cannot be STATE_UPGRADE_REQUESTED + __TBB_ASSERT( next->my_state.load(std::memory_order_relaxed) & (STATE_COMBINED_WAITINGREADER | STATE_WRITER), "unexpected state" ); + __TBB_ASSERT( !( next->my_prev.load(std::memory_order_relaxed) & FLAG ), "use of corrupted pointer!" ); + // Guarantee that above store of 2 into next->my_going happens-before resetting of next->my_prev + tricky_pointer::store(next->my_prev, nullptr, std::memory_order_release); + // We are releasing the mutex + next->my_going.store(1U, std::memory_order_release); + } + + } else { // Acquired for read + // The basic idea it to build happens-before relation with left and right readers via prev and next. 
In addition, + // the first reader should acquire the left (prev) signal and propagate to right (next). To simplify, we always + // build happens-before relation between left and right (left is happened before right). + queuing_rw_mutex::scoped_lock *tmp = nullptr; + retry: + // Addition to the original paper: Mark my_prev as in use + queuing_rw_mutex::scoped_lock *predecessor = tricky_pointer::fetch_add(s.my_prev, FLAG, std::memory_order_acquire); + + if( predecessor ) { + if( !(try_acquire_internal_lock(*predecessor)) ) + { + // Failed to acquire the lock on predecessor. The predecessor either unlinks or upgrades. + // In the second case, it could or could not know my "in use" flag - need to check + // Responsibility transition, the one who reads uncorrupted my_prev will do release. + tmp = tricky_pointer::compare_exchange_strong(s.my_prev, tricky_pointer(predecessor) | FLAG, predecessor, std::memory_order_acquire); + if( !(tricky_pointer(tmp) & FLAG) ) { + __TBB_ASSERT(tricky_pointer::load(s.my_prev, std::memory_order_relaxed) != (tricky_pointer(predecessor) | FLAG), nullptr); + // Now owner of predecessor is waiting for _us_ to release its lock + release_internal_lock(*predecessor); + } + // else the "in use" flag is back -> the predecessor didn't get it and will release itself; nothing to do + + tmp = nullptr; + goto retry; + } + __TBB_ASSERT(predecessor && predecessor->my_internal_lock.load(std::memory_order_relaxed)==ACQUIRED, "predecessor's lock is not acquired"); + tricky_pointer::store(s.my_prev, predecessor, std::memory_order_relaxed); + acquire_internal_lock(s); + + tricky_pointer::store(predecessor->my_next, nullptr, std::memory_order_release); + + d1::queuing_rw_mutex::scoped_lock* expected = &s; + if( !tricky_pointer::load(s.my_next, std::memory_order_acquire) && !s.my_mutex->q_tail.compare_exchange_strong(expected, predecessor, std::memory_order_release) ) { + spin_wait_while_eq( s.my_next, 0U, std::memory_order_acquire ); + } + __TBB_ASSERT( !(s.my_next.load(std::memory_order_relaxed) & FLAG), "use of corrupted pointer" ); + + // my_next is acquired either with load or spin_wait. 
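+            // If a successor exists, the two stores in the branch below splice this node out of the
+            // queue: the successor's my_prev is redirected to our predecessor, and the predecessor's
+            // my_next to our successor.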
+ if(d1::queuing_rw_mutex::scoped_lock *const l_next = tricky_pointer::load(s.my_next, std::memory_order_relaxed) ) { // I->next != nil, TODO: rename to next after clearing up and adapting the n in the comment two lines below + // Equivalent to I->next->prev = I->prev but protected against (prev[n]&FLAG)!=0 + tmp = tricky_pointer::exchange(l_next->my_prev, predecessor, std::memory_order_release); + // I->prev->next = I->next; + __TBB_ASSERT(tricky_pointer::load(s.my_prev, std::memory_order_relaxed)==predecessor, nullptr); + predecessor->my_next.store(s.my_next.load(std::memory_order_relaxed), std::memory_order_release); + } + // Safe to release in the order opposite to acquiring which makes the code simpler + release_internal_lock(*predecessor); + + } else { // No predecessor when we looked + acquire_internal_lock(s); // "exclusiveLock(&I->EL)" + d1::queuing_rw_mutex::scoped_lock* next = tricky_pointer::load(s.my_next, std::memory_order_acquire); + if( !next ) { + d1::queuing_rw_mutex::scoped_lock* expected = &s; + // Release mutex on success otherwise wait for successor publication + if( !s.my_mutex->q_tail.compare_exchange_strong(expected, nullptr, + std::memory_order_release, std::memory_order_relaxed) ) + { + spin_wait_while_eq( s.my_next, 0U, std::memory_order_relaxed ); + next = tricky_pointer::load(s.my_next, std::memory_order_acquire); + } else { + goto unlock_self; + } + } + next->my_going.store(2U, std::memory_order_relaxed); + // Responsibility transition, the one who reads uncorrupted my_prev will do release. + tmp = tricky_pointer::exchange(next->my_prev, nullptr, std::memory_order_release); + next->my_going.store(1U, std::memory_order_release); + } + unlock_self: + unblock_or_wait_on_internal_lock(s, get_flag(tmp)); + } + done: + // Lifetime synchronization, no need to build happens-before relation + spin_wait_while_eq( s.my_going, 2U, std::memory_order_relaxed ); + + s.initialize(); + } + + static bool downgrade_to_reader(d1::queuing_rw_mutex::scoped_lock& s) { + if ( s.my_state.load(std::memory_order_relaxed) == STATE_ACTIVEREADER ) return true; // Already a reader + + ITT_NOTIFY(sync_releasing, s.my_mutex); + d1::queuing_rw_mutex::scoped_lock* next = tricky_pointer::load(s.my_next, std::memory_order_acquire); + if( !next ) { + s.my_state.store(STATE_READER, std::memory_order_seq_cst); + // the following load of q_tail must not be reordered with setting STATE_READER above + if( &s == s.my_mutex->q_tail.load(std::memory_order_seq_cst) ) { + unsigned char old_state = STATE_READER; + // When this reader is signaled by previous actor it acquires the mutex. + // We need to build happens-before relation with all other coming readers that will read our ACTIVEREADER + // without blocking on my_going. Therefore, we need to publish ACTIVEREADER with release semantics. + // On fail it is relaxed, because we will build happens-before on my_going. + s.my_state.compare_exchange_strong(old_state, STATE_ACTIVEREADER, std::memory_order_release, std::memory_order_relaxed); + if( old_state==STATE_READER ) + return true; // Downgrade completed + } + /* wait for the next to register */ + spin_wait_while_eq(s.my_next, 0U, std::memory_order_relaxed); + next = tricky_pointer::load(s.my_next, std::memory_order_acquire); + } + + __TBB_ASSERT( next, "still no successor at this point!" 
); + if( next->my_state.load(std::memory_order_relaxed) & STATE_COMBINED_WAITINGREADER ) + next->my_going.store(1U, std::memory_order_release); + // If the next is STATE_UPGRADE_WAITING, it is expected to acquire all other released readers via release + // sequence in next->my_state. In that case, we need to preserve release sequence in next->my_state + // contributed by other reader. So, there are two approaches not to break the release sequence: + // 1. Use read-modify-write (exchange) operation to store with release the UPGRADE_LOSER state; + // 2. Acquire the release sequence and store the sequence and UPGRADE_LOSER state. + // The second approach seems better on x86 because it does not involve interlocked operations. + // Therefore, we read next->my_state with acquire while it is not required for else branch to get the + // release sequence. + else if( next->my_state.load(std::memory_order_acquire)==STATE_UPGRADE_WAITING ) + // the next waiting for upgrade means this writer was upgraded before. + // To safe release sequence on next->my_state read it with acquire + next->my_state.store(STATE_UPGRADE_LOSER, std::memory_order_release); + s.my_state.store(STATE_ACTIVEREADER, std::memory_order_release); + return true; + } + + static bool upgrade_to_writer(d1::queuing_rw_mutex::scoped_lock& s) { + if (s.my_state.load(std::memory_order_relaxed) == STATE_WRITER) { + // Already a writer + return true; + } + + __TBB_ASSERT(s.my_state.load(std::memory_order_relaxed) == STATE_ACTIVEREADER, "only active reader can be updated"); + + queuing_rw_mutex::scoped_lock* tmp{}; + queuing_rw_mutex::scoped_lock* me = &s; + + ITT_NOTIFY(sync_releasing, s.my_mutex); + // Publish ourselves into my_state that other UPGRADE_WAITING actors can acquire our state. + s.my_state.store(STATE_UPGRADE_REQUESTED, std::memory_order_release); + requested: + __TBB_ASSERT( !(s.my_next.load(std::memory_order_relaxed) & FLAG), "use of corrupted pointer!" ); + acquire_internal_lock(s); + d1::queuing_rw_mutex::scoped_lock* expected = &s; + if( !s.my_mutex->q_tail.compare_exchange_strong(expected, tricky_pointer(me)|FLAG, std::memory_order_acq_rel) ) { + spin_wait_while_eq( s.my_next, 0U, std::memory_order_relaxed ); + queuing_rw_mutex::scoped_lock * next; + next = tricky_pointer::fetch_add(s.my_next, FLAG, std::memory_order_acquire); + // While we were READER the next READER might reach STATE_UPGRADE_WAITING state. + // Therefore, it did not build happens before relation with us and we need to acquire the + // next->my_state to build the happens before relation ourselves + unsigned short n_state = next->my_state.load(std::memory_order_acquire); + /* the next reader can be blocked by our state. the best thing to do is to unblock it */ + if( n_state & STATE_COMBINED_WAITINGREADER ) + next->my_going.store(1U, std::memory_order_release); + // Responsibility transition, the one who reads uncorrupted my_prev will do release. 
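+            // (Throughout this file, FLAG is a tag bit OR-ed into a neighbour pointer to mark it
+            // 'in use'; a pointer with the tag still set is what the asserts call a "corrupted" pointer.)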
+ tmp = tricky_pointer::exchange(next->my_prev, &s, std::memory_order_release); + unblock_or_wait_on_internal_lock(s, get_flag(tmp)); + if( n_state & (STATE_COMBINED_READER | STATE_UPGRADE_REQUESTED) ) { + // save next|FLAG for simplicity of following comparisons + tmp = tricky_pointer(next)|FLAG; + for( atomic_backoff b; tricky_pointer::load(s.my_next, std::memory_order_relaxed)==tmp; b.pause() ) { + if( s.my_state.load(std::memory_order_acquire) & STATE_COMBINED_UPGRADING ) { + if( tricky_pointer::load(s.my_next, std::memory_order_acquire)==tmp ) + tricky_pointer::store(s.my_next, next, std::memory_order_relaxed); + goto waiting; + } + } + __TBB_ASSERT(tricky_pointer::load(s.my_next, std::memory_order_relaxed) != (tricky_pointer(next)|FLAG), nullptr); + goto requested; + } else { + __TBB_ASSERT( n_state & (STATE_WRITER | STATE_UPGRADE_WAITING), "unexpected state"); + __TBB_ASSERT( (tricky_pointer(next)|FLAG) == tricky_pointer::load(s.my_next, std::memory_order_relaxed), nullptr); + tricky_pointer::store(s.my_next, next, std::memory_order_relaxed); + } + } else { + /* We are in the tail; whoever comes next is blocked by q_tail&FLAG */ + release_internal_lock(s); + } // if( this != my_mutex->q_tail... ) + { + unsigned char old_state = STATE_UPGRADE_REQUESTED; + // If we reach STATE_UPGRADE_WAITING state we do not build happens-before relation with READER on + // left. We delegate this responsibility to READER on left when it try upgrading. Therefore, we are releasing + // on success. + // Otherwise, on fail, we already acquired the next->my_state. + s.my_state.compare_exchange_strong(old_state, STATE_UPGRADE_WAITING, std::memory_order_release, std::memory_order_relaxed); + } + waiting: + __TBB_ASSERT( !( s.my_next.load(std::memory_order_relaxed) & FLAG ), "use of corrupted pointer!" ); + __TBB_ASSERT( s.my_state & STATE_COMBINED_UPGRADING, "wrong state at upgrade waiting_retry" ); + __TBB_ASSERT( me==&s, nullptr ); + ITT_NOTIFY(sync_prepare, s.my_mutex); + /* if no one was blocked by the "corrupted" q_tail, turn it back */ + expected = tricky_pointer(me)|FLAG; + s.my_mutex->q_tail.compare_exchange_strong(expected, &s, std::memory_order_release); + queuing_rw_mutex::scoped_lock * predecessor; + // Mark my_prev as 'in use' to prevent predecessor from releasing + predecessor = tricky_pointer::fetch_add(s.my_prev, FLAG, std::memory_order_acquire); + if( predecessor ) { + bool success = try_acquire_internal_lock(*predecessor); + { + // While the predecessor pointer (my_prev) is in use (FLAG is set), we can safely update the node`s state. + // Corrupted pointer transitions responsibility to release the predecessor`s node on us. + unsigned char old_state = STATE_UPGRADE_REQUESTED; + // Try to build happens before with the upgrading READER on left. If fail, the predecessor state is not + // important for us because it will acquire our state. + predecessor->my_state.compare_exchange_strong(old_state, STATE_UPGRADE_WAITING, std::memory_order_release, + std::memory_order_relaxed); + } + if( !success ) { + // Responsibility transition, the one who reads uncorrupted my_prev will do release. 
+ tmp = tricky_pointer::compare_exchange_strong(s.my_prev, tricky_pointer(predecessor)|FLAG, predecessor, std::memory_order_acquire); + if( tricky_pointer(tmp) & FLAG ) { + tricky_pointer::spin_wait_while_eq(s.my_prev, predecessor); + predecessor = tricky_pointer::load(s.my_prev, std::memory_order_relaxed); + } else { + // TODO: spin_wait condition seems never reachable + tricky_pointer::spin_wait_while_eq(s.my_prev, tricky_pointer(predecessor)|FLAG); + release_internal_lock(*predecessor); + } + } else { + tricky_pointer::store(s.my_prev, predecessor, std::memory_order_relaxed); + release_internal_lock(*predecessor); + tricky_pointer::spin_wait_while_eq(s.my_prev, predecessor); + predecessor = tricky_pointer::load(s.my_prev, std::memory_order_relaxed); + } + if( predecessor ) + goto waiting; + } else { + tricky_pointer::store(s.my_prev, nullptr, std::memory_order_relaxed); + } + __TBB_ASSERT( !predecessor && !s.my_prev, nullptr ); + + // additional lifetime issue prevention checks + // wait for the successor to finish working with my fields + wait_for_release_of_internal_lock(s); + // now wait for the predecessor to finish working with my fields + spin_wait_while_eq( s.my_going, 2U ); + + bool result = ( s.my_state != STATE_UPGRADE_LOSER ); + s.my_state.store(STATE_WRITER, std::memory_order_relaxed); + s.my_going.store(1U, std::memory_order_relaxed); + + ITT_NOTIFY(sync_acquired, s.my_mutex); + return result; + } + + static bool is_writer(const d1::queuing_rw_mutex::scoped_lock& m) { + return m.my_state.load(std::memory_order_relaxed) == STATE_WRITER; + } + + static void construct(d1::queuing_rw_mutex& m) { + suppress_unused_warning(m); + ITT_SYNC_CREATE(&m, _T("tbb::queuing_rw_mutex"), _T("")); + } +}; + +void __TBB_EXPORTED_FUNC acquire(d1::queuing_rw_mutex& m, d1::queuing_rw_mutex::scoped_lock& s, bool write) { + queuing_rw_mutex_impl::acquire(m, s, write); +} + +bool __TBB_EXPORTED_FUNC try_acquire(d1::queuing_rw_mutex& m, d1::queuing_rw_mutex::scoped_lock& s, bool write) { + return queuing_rw_mutex_impl::try_acquire(m, s, write); +} + +void __TBB_EXPORTED_FUNC release(d1::queuing_rw_mutex::scoped_lock& s) { + queuing_rw_mutex_impl::release(s); +} + +bool __TBB_EXPORTED_FUNC upgrade_to_writer(d1::queuing_rw_mutex::scoped_lock& s) { + return queuing_rw_mutex_impl::upgrade_to_writer(s); +} + +bool __TBB_EXPORTED_FUNC is_writer(const d1::queuing_rw_mutex::scoped_lock& s) { + return queuing_rw_mutex_impl::is_writer(s); +} + +bool __TBB_EXPORTED_FUNC downgrade_to_reader(d1::queuing_rw_mutex::scoped_lock& s) { + return queuing_rw_mutex_impl::downgrade_to_reader(s); +} + +void __TBB_EXPORTED_FUNC construct(d1::queuing_rw_mutex& m) { + queuing_rw_mutex_impl::construct(m); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/queuing_rw_mutex.h b/third_party/tbb/queuing_rw_mutex.h new file mode 100644 index 000000000..4c9368b1b --- /dev/null +++ b/third_party/tbb/queuing_rw_mutex.h @@ -0,0 +1,208 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_queuing_rw_mutex_H +#define __TBB_queuing_rw_mutex_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_mutex_common.h" + +#include "third_party/tbb/profiling.h" + +#include "third_party/libcxx/cstring" +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace r1 { +struct queuing_rw_mutex_impl; +} +namespace d1 { + +//! Queuing reader-writer mutex with local-only spinning. +/** Adapted from Krieger, Stumm, et al. pseudocode at + https://www.researchgate.net/publication/221083709_A_Fair_Fast_Scalable_Reader-Writer_Lock + @ingroup synchronization */ +class queuing_rw_mutex { + friend r1::queuing_rw_mutex_impl; +public: + //! Construct unacquired mutex. + queuing_rw_mutex() noexcept { + create_itt_sync(this, "tbb::queuing_rw_mutex", ""); + } + + //! Destructor asserts if the mutex is acquired, i.e. q_tail is non-null + ~queuing_rw_mutex() { + __TBB_ASSERT(q_tail.load(std::memory_order_relaxed) == nullptr, "destruction of an acquired mutex"); + } + + //! No Copy + queuing_rw_mutex(const queuing_rw_mutex&) = delete; + queuing_rw_mutex& operator=(const queuing_rw_mutex&) = delete; + + //! The scoped locking pattern + /** It helps to avoid the common problem of forgetting to release lock. + It also nicely provides the "node" for queuing locks. */ + class scoped_lock { + friend r1::queuing_rw_mutex_impl; + //! Initialize fields to mean "no lock held". + void initialize() { + my_mutex = nullptr; + my_internal_lock.store(0, std::memory_order_relaxed); + my_going.store(0, std::memory_order_relaxed); +#if TBB_USE_ASSERT + my_state = 0xFF; // Set to invalid state + my_next.store(reinterpret_cast(reinterpret_cast(-1)), std::memory_order_relaxed); + my_prev.store(reinterpret_cast(reinterpret_cast(-1)), std::memory_order_relaxed); +#endif /* TBB_USE_ASSERT */ + } + + public: + //! Construct lock that has not acquired a mutex. + /** Equivalent to zero-initialization of *this. */ + scoped_lock() {initialize();} + + //! Acquire lock on given mutex. + scoped_lock( queuing_rw_mutex& m, bool write=true ) { + initialize(); + acquire(m,write); + } + + //! Release lock (if lock is held). + ~scoped_lock() { + if( my_mutex ) release(); + } + + //! No Copy + scoped_lock(const scoped_lock&) = delete; + scoped_lock& operator=(const scoped_lock&) = delete; + + //! Acquire lock on given mutex. + void acquire( queuing_rw_mutex& m, bool write=true ); + + //! Acquire lock on given mutex if free (i.e. non-blocking) + bool try_acquire( queuing_rw_mutex& m, bool write=true ); + + //! Release lock. + void release(); + + //! Upgrade reader to become a writer. + /** Returns whether the upgrade happened without releasing and re-acquiring the lock */ + bool upgrade_to_writer(); + + //! Downgrade writer to become a reader. + bool downgrade_to_reader(); + + bool is_writer() const; + + private: + //! The pointer to the mutex owned, or nullptr if not holding a mutex. + queuing_rw_mutex* my_mutex; + + //! The 'pointer' to the previous and next competitors for a mutex + std::atomic my_prev; + std::atomic my_next; + + using state_t = unsigned char ; + + //! State of the request: reader, writer, active reader, other service states + std::atomic my_state; + + //! 
The local spin-wait variable + /** Corresponds to "spin" in the pseudocode but inverted for the sake of zero-initialization */ + std::atomic my_going; + + //! A tiny internal lock + std::atomic my_internal_lock; + }; + + // Mutex traits + static constexpr bool is_rw_mutex = true; + static constexpr bool is_recursive_mutex = false; + static constexpr bool is_fair_mutex = true; + +private: + //! The last competitor requesting the lock + std::atomic q_tail{nullptr}; +}; +#if TBB_USE_PROFILING_TOOLS +inline void set_name(queuing_rw_mutex& obj, const char* name) { + itt_set_sync_name(&obj, name); +} +#if (_WIN32||_WIN64) +inline void set_name(queuing_rw_mutex& obj, const wchar_t* name) { + itt_set_sync_name(&obj, name); +} +#endif //WIN +#else +inline void set_name(queuing_rw_mutex&, const char*) {} +#if (_WIN32||_WIN64) +inline void set_name(queuing_rw_mutex&, const wchar_t*) {} +#endif //WIN +#endif +} // namespace d1 + +namespace r1 { +TBB_EXPORT void acquire(d1::queuing_rw_mutex&, d1::queuing_rw_mutex::scoped_lock&, bool); +TBB_EXPORT bool try_acquire(d1::queuing_rw_mutex&, d1::queuing_rw_mutex::scoped_lock&, bool); +TBB_EXPORT void release(d1::queuing_rw_mutex::scoped_lock&); +TBB_EXPORT bool upgrade_to_writer(d1::queuing_rw_mutex::scoped_lock&); +TBB_EXPORT bool downgrade_to_reader(d1::queuing_rw_mutex::scoped_lock&); +TBB_EXPORT bool is_writer(const d1::queuing_rw_mutex::scoped_lock&); +} // namespace r1 + +namespace d1 { + + +inline void queuing_rw_mutex::scoped_lock::acquire(queuing_rw_mutex& m,bool write) { + r1::acquire(m, *this, write); +} + +inline bool queuing_rw_mutex::scoped_lock::try_acquire(queuing_rw_mutex& m, bool write) { + return r1::try_acquire(m, *this, write); +} + +inline void queuing_rw_mutex::scoped_lock::release() { + r1::release(*this); +} + +inline bool queuing_rw_mutex::scoped_lock::upgrade_to_writer() { + return r1::upgrade_to_writer(*this); +} + +inline bool queuing_rw_mutex::scoped_lock::downgrade_to_reader() { + return r1::downgrade_to_reader(*this); +} + +inline bool queuing_rw_mutex::scoped_lock::is_writer() const { + return r1::is_writer(*this); +} +} // namespace d1 + +} // namespace detail + +inline namespace v1 { +using detail::d1::queuing_rw_mutex; +} // namespace v1 +namespace profiling { + using detail::d1::set_name; +} +} // namespace tbb + +#endif /* __TBB_queuing_rw_mutex_H */ diff --git a/third_party/tbb/rml_base.h b/third_party/tbb/rml_base.h new file mode 100644 index 000000000..f903c39e1 --- /dev/null +++ b/third_party/tbb/rml_base.h @@ -0,0 +1,182 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Header guard and namespace names follow rml conventions. 
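To make the reader/writer interface declared in queuing_rw_mutex.h above concrete, here is a minimal usage sketch. It is illustrative only and not part of the patch: the include path follows this tree's layout, and table_mutex, table_size, read_size, and grow_if_empty are hypothetical names.

#include "third_party/tbb/queuing_rw_mutex.h"

// Hypothetical shared state guarded by a queuing_rw_mutex.
static tbb::queuing_rw_mutex table_mutex;
static int table_size = 0;

int read_size() {
    // Shared (reader) lock: pass write = false. The destructor releases the lock.
    tbb::queuing_rw_mutex::scoped_lock lock(table_mutex, /*write=*/false);
    return table_size;
}

void grow_if_empty() {
    tbb::queuing_rw_mutex::scoped_lock lock(table_mutex, /*write=*/false);
    if (table_size == 0) {
        // upgrade_to_writer() returns false when the lock had to be released
        // and re-acquired, so the guarded condition must be re-checked.
        if (!lock.upgrade_to_writer() && table_size != 0)
            return;
        table_size = 16;
        lock.downgrade_to_reader();  // drop back to shared ownership
    }
}

Because the mutex is queue-based and fair (is_fair_mutex is true above), each waiter spins only on its own queue node; that local spinning is what the my_going/my_prev/my_next machinery in queuing_rw_mutex.cpp implements.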
+ +#ifndef __RML_rml_base_H +#define __RML_rml_base_H + +#include "third_party/libcxx/cstddef" + +#if _WIN32||_WIN64 +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" +#endif /* _WIN32||_WIN64 */ + +#ifdef RML_PURE_VIRTUAL_HANDLER +#define RML_PURE(T) {RML_PURE_VIRTUAL_HANDLER(); return (T)0;} +#else +#define RML_PURE(T) = 0; +#endif + +namespace rml { + +class server; + +class versioned_object { +public: + //! A version number + typedef unsigned version_type; + + virtual ~versioned_object() {} + + //! Get version of this object + /** The version number is incremented when a incompatible change is introduced. + The version number is invariant for the lifetime of the object. */ + virtual version_type version() const RML_PURE(version_type) + +}; + +//! Represents a client's job for an execution context. +/** A job object is constructed by the client. + Not derived from versioned_object because version is same as for client. */ +class job { + friend class server; +}; + +//! Information that client provides to server when asking for a server. +/** The instance must endure at least until acknowledge_close_connection is called. */ +class client: public versioned_object { +public: + //! Typedef for convenience of derived classes in other namespaces. + typedef ::rml::job job; + + //! Index of a job in a job pool + typedef unsigned size_type; + + //! Maximum number of threads that client can exploit profitably if nothing else is running on the machine. + /** The returned value should remain invariant for the lifetime of the connection. [idempotent] */ + virtual size_type max_job_count() const RML_PURE(size_type) + + //! Minimum stack size for each job. 0 means to use default stack size. [idempotent] + virtual std::size_t min_stack_size() const RML_PURE(std::size_t) + + //! Server calls this routine when it needs client to create a job object. + virtual job* create_one_job() RML_PURE(job*) + + //! Acknowledge that all jobs have been cleaned up. + /** Called by server in response to request_close_connection + after cleanup(job) has been called for each job. */ + virtual void acknowledge_close_connection() RML_PURE(void) + + //! Inform client that server is done with *this. + /** Client should destroy the job. + Not necessarily called by execution context represented by *this. + Never called while any other thread is working on the job. */ + virtual void cleanup( job& ) RML_PURE(void) + + // In general, we should not add new virtual methods, because that would + // break derived classes. Think about reserving some vtable slots. +}; + +// Information that server provides to client. +// Virtual functions are routines provided by the server for the client to call. +class server: public versioned_object { +public: + //! Typedef for convenience of derived classes. + typedef ::rml::job job; + +#if _WIN32||_WIN64 + typedef void* execution_resource_t; +#endif + + //! Request that connection to server be closed. 
+ /** Causes each job associated with the client to have its cleanup method called, + possibly by a thread different than the thread that created the job. + This method can return before all cleanup methods return. + Actions that have to wait after all cleanup methods return should be part of + client::acknowledge_close_connection. + Pass true as exiting if request_close_connection() is called because exit() is + called. In that case, it is the client's responsibility to make sure all threads + are terminated. In all other cases, pass false. */ + virtual void request_close_connection( bool exiting = false ) = 0; + + //! Called by client thread when it reaches a point where it cannot make progress until other threads do. + virtual void yield() = 0; + + //! Called by client to indicate a change in the number of non-RML threads that are running. + /** This is a performance hint to the RML to adjust how many threads it should let run + concurrently. The delta is the change in the number of non-RML threads that are running. + For example, a value of 1 means the client has started running another thread, and a value + of -1 indicates that the client has blocked or terminated one of its threads. */ + virtual void independent_thread_number_changed( int delta ) = 0; + + //! Default level of concurrency for which RML strives when there are no non-RML threads running. + /** Normally, the value is the hardware concurrency minus one. + The "minus one" accounts for the thread created by main(). */ + virtual unsigned default_concurrency() const = 0; +}; + +class factory { +public: + //! status results + enum status_type { + st_success=0, + st_connection_exists, + st_not_found, + st_incompatible + }; + +protected: + //! Pointer to routine that waits for server to indicate when client can close itself. + status_type (*my_wait_to_close_routine)( factory& ); + +public: + //! Library handle for use by RML. +#if _WIN32||_WIN64 + HMODULE library_handle; +#else + void* library_handle; +#endif /* _WIN32||_WIN64 */ + + //! Special marker to keep dll from being unloaded prematurely + static const std::size_t c_dont_unload = 1; +}; + +//! Typedef for callback functions to print server info +typedef void (*server_info_callback_t)( void* arg, const char* server_info ); + +} // namespace rml + +#endif /* __RML_rml_base_H */ diff --git a/third_party/tbb/rml_tbb.cpp b/third_party/tbb/rml_tbb.cpp new file mode 100644 index 000000000..a08ad0ecd --- /dev/null +++ b/third_party/tbb/rml_tbb.cpp @@ -0,0 +1,113 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "third_party/tbb/detail/_assert.h" + +#include "third_party/tbb/rml_tbb.h" +#include "third_party/tbb/dynamic_link.h" + +namespace tbb { +namespace detail { +namespace r1 { +namespace rml { + +#define MAKE_SERVER(x) DLD(__TBB_make_rml_server,x) +#define GET_INFO(x) DLD(__TBB_call_with_my_server_info,x) +#define SERVER tbb_server +#define CLIENT tbb_client +#define FACTORY tbb_factory + +#if __TBB_WEAK_SYMBOLS_PRESENT + #pragma weak __TBB_make_rml_server + #pragma weak __TBB_call_with_my_server_info + extern "C" { + ::rml::factory::status_type __TBB_make_rml_server( rml::tbb_factory& f, rml::tbb_server*& server, rml::tbb_client& client ); + void __TBB_call_with_my_server_info( ::rml::server_info_callback_t cb, void* arg ); + } +#endif /* __TBB_WEAK_SYMBOLS_PRESENT */ + +#if TBB_USE_DEBUG +#define DEBUG_SUFFIX "_debug" +#else +#define DEBUG_SUFFIX +#endif /* TBB_USE_DEBUG */ + +// RML_SERVER_NAME is the name of the RML server library. +#if _WIN32 || _WIN64 +#define RML_SERVER_NAME "irml" DEBUG_SUFFIX ".dll" +#elif __APPLE__ +#define RML_SERVER_NAME "libirml" DEBUG_SUFFIX ".1.dylib" +#elif __FreeBSD__ || __NetBSD__ || __OpenBSD__ || __sun || _AIX +#define RML_SERVER_NAME "libirml" DEBUG_SUFFIX ".so" +#elif __unix__ +#define RML_SERVER_NAME "libirml" DEBUG_SUFFIX ".so.1" +#else +#error Unknown OS +#endif + +const ::rml::versioned_object::version_type CLIENT_VERSION = 2; + +#if __TBB_WEAK_SYMBOLS_PRESENT + #pragma weak __RML_open_factory + #pragma weak __RML_close_factory + extern "C" { + ::rml::factory::status_type __RML_open_factory ( ::rml::factory&, ::rml::versioned_object::version_type&, ::rml::versioned_object::version_type ); + void __RML_close_factory( ::rml::factory& f ); + } +#endif /* __TBB_WEAK_SYMBOLS_PRESENT */ + +::rml::factory::status_type FACTORY::open() { + // Failure of following assertion indicates that factory is already open, or not zero-inited. + __TBB_ASSERT_EX( !library_handle, nullptr); + status_type (*open_factory_routine)( factory&, version_type&, version_type ); + dynamic_link_descriptor server_link_table[4] = { + DLD(__RML_open_factory,open_factory_routine), + MAKE_SERVER(my_make_server_routine), + DLD(__RML_close_factory,my_wait_to_close_routine), + GET_INFO(my_call_with_server_info_routine), + }; + status_type result; + if ( dynamic_link( RML_SERVER_NAME, server_link_table, 4, &library_handle ) ) { + version_type server_version; + result = (*open_factory_routine)( *this, server_version, CLIENT_VERSION ); + // server_version can be checked here for incompatibility if necessary. + } else { + library_handle = nullptr; + result = st_not_found; + } + return result; +} + +void FACTORY::close() { + if ( library_handle ) + (*my_wait_to_close_routine)(*this); + if ( (size_t)library_handle>FACTORY::c_dont_unload ) { + dynamic_unlink(library_handle); + library_handle = nullptr; + } +} + +::rml::factory::status_type FACTORY::make_server( SERVER*& s, CLIENT& c) { + // Failure of following assertion means that factory was not successfully opened. 
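+    // (my_make_server_routine is one of the entry points bound by dynamic_link() in FACTORY::open(),
+    // so it is expected to be non-null only after a successful open().)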
+ __TBB_ASSERT_EX( my_make_server_routine, nullptr); + return (*my_make_server_routine)(*this,s,c); +} + +} // namespace rml +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/rml_tbb.h b/third_party/tbb/rml_tbb.h new file mode 100644 index 000000000..dd571af47 --- /dev/null +++ b/third_party/tbb/rml_tbb.h @@ -0,0 +1,95 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Header guard and namespace names follow TBB conventions. + +#ifndef __TBB_rml_tbb_H +#define __TBB_rml_tbb_H + +#include "third_party/tbb/version.h" +#include "third_party/tbb/rml_base.h" + +namespace tbb { +namespace detail { +namespace r1 { +namespace rml { + +//------------------------------------------------------------------------ +// Classes instantiated by the server +//------------------------------------------------------------------------ + +//! Represents a set of oneTBB worker threads provided by the server. +class tbb_server: public ::rml::server { +public: + //! Inform server of adjustments in the number of workers that the client can profitably use. + virtual void adjust_job_count_estimate( int delta ) = 0; + +#if _WIN32 || _WIN64 + //! Inform server of a oneTBB external thread. + virtual void register_external_thread( execution_resource_t& v ) = 0; + + //! Inform server that the oneTBB external thread is done with its work. + virtual void unregister_external_thread( execution_resource_t v ) = 0; +#endif /* _WIN32||_WIN64 */ +}; + +//------------------------------------------------------------------------ +// Classes instantiated by the client +//------------------------------------------------------------------------ + +class tbb_client: public ::rml::client { +public: + //! Defined by TBB to steal a task and execute it. + /** Called by server when it wants an execution context to do some TBB work. + The method should return when it is okay for the thread to yield indefinitely. */ + virtual void process( job& ) RML_PURE(void) +}; + +/** Client must ensure that instance is zero-inited, typically by being a file-scope object. */ +class tbb_factory: public ::rml::factory { + + //! Pointer to routine that creates an RML server. + status_type (*my_make_server_routine)( tbb_factory&, tbb_server*&, tbb_client& ); + + //! Pointer to routine that calls callback function with server version info. + void (*my_call_with_server_info_routine)( ::rml::server_info_callback_t cb, void* arg ); + +public: + typedef ::rml::versioned_object::version_type version_type; + typedef tbb_client client_type; + typedef tbb_server server_type; + + //! Open factory. + /** Dynamically links against RML library. + Returns st_success, st_incompatible, or st_not_found. */ + status_type open(); + + //! Factory method to be called by client to create a server object. + /** Factory must be open. + Returns st_success, or st_incompatible . */ + status_type make_server( server_type*&, client_type& ); + + //! 
Close factory + void close(); +}; + +} // namespace rml +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /*__TBB_rml_tbb_H */ diff --git a/third_party/tbb/rml_thread_monitor.h b/third_party/tbb/rml_thread_monitor.h new file mode 100644 index 000000000..5b8a4d4d4 --- /dev/null +++ b/third_party/tbb/rml_thread_monitor.h @@ -0,0 +1,277 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// All platform-specific threading support is encapsulated here. */ + +#ifndef __RML_thread_monitor_H +#define __RML_thread_monitor_H + +#if __TBB_USE_WINAPI +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" +// MISSING #include +#include "libc/mem/mem.h" //_alloca +#include "third_party/tbb/misc.h" // support for processor groups +#if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00) +#include "third_party/libcxx/thread" +#endif +#elif __TBB_USE_POSIX +#include "libc/calls/weirdtypes.h" +#include "libc/sysv/consts/clock.h" +#include "libc/thread/thread.h" +#include "libc/thread/thread2.h" +#include "third_party/libcxx/cstring" +#include "third_party/libcxx/cstdlib" +#include "libc/calls/calls.h" +#include "libc/calls/struct/timespec.h" +#include "libc/calls/struct/timeval.h" +#include "libc/calls/weirdtypes.h" +#include "libc/sysv/consts/clock.h" +#include "libc/sysv/consts/sched.h" +#include "libc/sysv/consts/timer.h" +#include "libc/time/struct/tm.h" +#include "libc/time/time.h" +#else +#error Unsupported platform +#endif +#include "third_party/libcxx/cstdio" + +#include "third_party/tbb/detail/_template_helpers.h" + +#include "third_party/tbb/itt_notify.h" +#include "third_party/tbb/semaphore.h" + +// All platform-specific threading support is in this header. + +#if (_WIN32||_WIN64)&&!__TBB_ipf +// Deal with 64K aliasing. The formula for "offset" is a Fibonacci hash function, +// which has the desirable feature of spreading out the offsets fairly evenly +// without knowing the total number of offsets, and furthermore unlikely to +// accidentally cancel out other 64K aliasing schemes that Microsoft might implement later. +// See Knuth Vol 3. "Theorem S" for details on Fibonacci hashing. +// The second statement is really does need "volatile", otherwise the compiler might remove the _alloca. 
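+// For example (illustrative arithmetic): worker index 0 yields offset (0+1)*40503 % 65536 = 40503,
+// index 1 yields 81006 % 65536 = 15470, index 2 yields 121509 % 65536 = 55973, and so on.
+// Since 40503/65536 is roughly the golden-ratio fraction 0.618, successive workers' stack tops are
+// spread nearly evenly across the 64K aliasing window.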
+#define AVOID_64K_ALIASING(idx) \ + std::size_t offset = (idx+1) * 40503U % (1U<<16); \ + void* volatile sink_for_alloca = _alloca(offset); \ + __TBB_ASSERT_EX(sink_for_alloca, "_alloca failed"); +#else +// Linux thread allocators avoid 64K aliasing. +#define AVOID_64K_ALIASING(idx) tbb::detail::suppress_unused_warning(idx) +#endif /* _WIN32||_WIN64 */ + +namespace tbb { +namespace detail { +namespace r1 { + +// Forward declaration: throws std::runtime_error with what() returning error_code description prefixed with aux_info +void handle_perror(int error_code, const char* aux_info); + +namespace rml { +namespace internal { + +#if __TBB_USE_ITT_NOTIFY +static const ::tbb::detail::r1::tchar *SyncType_RML = _T("%Constant"); +static const ::tbb::detail::r1::tchar *SyncObj_ThreadMonitor = _T("RML Thr Monitor"); +#endif /* __TBB_USE_ITT_NOTIFY */ + +//! Monitor with limited two-phase commit form of wait. +/** At most one thread should wait on an instance at a time. */ +class thread_monitor { +public: + thread_monitor() { + ITT_SYNC_CREATE(&my_sema, SyncType_RML, SyncObj_ThreadMonitor); + } + ~thread_monitor() {} + + //! Notify waiting thread + /** Can be called by any thread. */ + void notify(); + + //! Wait for notification + void wait(); + +#if __TBB_USE_WINAPI + typedef HANDLE handle_type; + + #define __RML_DECL_THREAD_ROUTINE unsigned WINAPI + typedef unsigned (WINAPI *thread_routine_type)(void*); + + //! Launch a thread + static handle_type launch( thread_routine_type thread_routine, void* arg, std::size_t stack_size, const size_t* worker_index = nullptr ); + +#elif __TBB_USE_POSIX + typedef pthread_t handle_type; + + #define __RML_DECL_THREAD_ROUTINE void* + typedef void*(*thread_routine_type)(void*); + + //! Launch a thread + static handle_type launch( thread_routine_type thread_routine, void* arg, std::size_t stack_size ); +#endif /* __TBB_USE_POSIX */ + + //! Join thread + static void join(handle_type handle); + + //! Detach thread + static void detach_thread(handle_type handle); +private: + // The protection from double notification of the binary semaphore + std::atomic my_notified{ false }; + binary_semaphore my_sema; +#if __TBB_USE_POSIX + static void check( int error_code, const char* routine ); +#endif +}; + +#if __TBB_USE_WINAPI + +#ifndef STACK_SIZE_PARAM_IS_A_RESERVATION +#define STACK_SIZE_PARAM_IS_A_RESERVATION 0x00010000 +#endif + +// _beginthreadex API is not available in Windows 8 Store* applications, so use std::thread instead +#if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00) +inline thread_monitor::handle_type thread_monitor::launch( thread_routine_type thread_function, void* arg, std::size_t, const std::size_t*) { +//TODO: check that exception thrown from std::thread is not swallowed silently + std::thread* thread_tmp=new std::thread(thread_function, arg); + return thread_tmp->native_handle(); +} +#else +inline thread_monitor::handle_type thread_monitor::launch( thread_routine_type thread_routine, void* arg, std::size_t stack_size, const std::size_t* worker_index ) { + unsigned thread_id; + int number_of_processor_groups = ( worker_index ) ? NumberOfProcessorGroups() : 0; + unsigned create_flags = ( number_of_processor_groups > 1 ) ? 
CREATE_SUSPENDED : 0; + HANDLE h = (HANDLE)_beginthreadex( nullptr, unsigned(stack_size), thread_routine, arg, STACK_SIZE_PARAM_IS_A_RESERVATION | create_flags, &thread_id ); + if( !h ) { + handle_perror(0, "thread_monitor::launch: _beginthreadex failed\n"); + } + if ( number_of_processor_groups > 1 ) { + MoveThreadIntoProcessorGroup( h, FindProcessorGroupIndex( static_cast(*worker_index) ) ); + ResumeThread( h ); + } + return h; +} +#endif //__TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00) + +void thread_monitor::join(handle_type handle) { +#if TBB_USE_ASSERT + DWORD res = +#endif + WaitForSingleObjectEx(handle, INFINITE, FALSE); + __TBB_ASSERT( res==WAIT_OBJECT_0, nullptr); +#if TBB_USE_ASSERT + BOOL val = +#endif + CloseHandle(handle); + __TBB_ASSERT( val, nullptr); +} + +void thread_monitor::detach_thread(handle_type handle) { +#if TBB_USE_ASSERT + BOOL val = +#endif + CloseHandle(handle); + __TBB_ASSERT( val, nullptr); +} + +#endif /* __TBB_USE_WINAPI */ + +#if __TBB_USE_POSIX +inline void thread_monitor::check( int error_code, const char* routine ) { + if( error_code ) { + handle_perror(error_code, routine); + } +} + +inline thread_monitor::handle_type thread_monitor::launch( void* (*thread_routine)(void*), void* arg, std::size_t stack_size ) { + // FIXME - consider more graceful recovery than just exiting if a thread cannot be launched. + // Note that there are some tricky situations to deal with, such that the thread is already + // grabbed as part of an OpenMP team. + pthread_attr_t s; + check(pthread_attr_init( &s ), "pthread_attr_init has failed"); + if( stack_size>0 ) + check(pthread_attr_setstacksize( &s, stack_size ), "pthread_attr_setstack_size has failed" ); + + // pthread_create(2) can spuriously fail with EAGAIN. We retry + // max_num_tries times with progressively longer wait times. + pthread_t handle; + const int max_num_tries = 20; + int error = EAGAIN; + + for (int i = 0; i < max_num_tries && error == EAGAIN; i++) { + if (i != 0) { + // Wait i milliseconds + struct timespec ts = {0, i * 1000 * 1000}; + nanosleep(&ts, NULL); + } + error = pthread_create(&handle, &s, thread_routine, arg); + } + + if (error) + handle_perror(error, "pthread_create has failed"); + + check( pthread_attr_destroy( &s ), "pthread_attr_destroy has failed" ); + return handle; +} + +void thread_monitor::join(handle_type handle) { + check(pthread_join(handle, nullptr), "pthread_join has failed"); +} + +void thread_monitor::detach_thread(handle_type handle) { + check(pthread_detach(handle), "pthread_detach has failed"); +} +#endif /* __TBB_USE_POSIX */ + +inline void thread_monitor::notify() { + // Check that the semaphore is not notified twice + if (!my_notified.exchange(true, std::memory_order_release)) { + my_sema.V(); + } +} + +inline void thread_monitor::wait() { + my_sema.P(); + // memory_order_seq_cst is required here to be ordered with + // further load checking shutdown state + my_notified.store(false, std::memory_order_seq_cst); +} + +} // namespace internal +} // namespace rml +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* __RML_thread_monitor_H */ diff --git a/third_party/tbb/rtm_mutex.cpp b/third_party/tbb/rtm_mutex.cpp new file mode 100644 index 000000000..04328689a --- /dev/null +++ b/third_party/tbb/rtm_mutex.cpp @@ -0,0 +1,122 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_rtm_mutex.h" +#include "third_party/tbb/itt_notify.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/misc.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace r1 { + + +struct rtm_mutex_impl { + // maximum number of times to retry + // TODO: experiment on retry values. + static constexpr int retry_threshold = 10; + using transaction_result_type = decltype(begin_transaction()); + + //! Release speculative mutex + static void release(d1::rtm_mutex::scoped_lock& s) { + switch(s.m_transaction_state) { + case d1::rtm_mutex::rtm_state::rtm_transacting: + __TBB_ASSERT(is_in_transaction(), "m_transaction_state && not speculating"); + end_transaction(); + s.m_mutex = nullptr; + break; + case d1::rtm_mutex::rtm_state::rtm_real: + s.m_mutex->unlock(); + s.m_mutex = nullptr; + break; + case d1::rtm_mutex::rtm_state::rtm_none: + __TBB_ASSERT(false, "mutex is not locked, but in release"); + break; + default: + __TBB_ASSERT(false, "invalid m_transaction_state"); + } + s.m_transaction_state = d1::rtm_mutex::rtm_state::rtm_none; + } + + //! Acquire lock on the given mutex. + static void acquire(d1::rtm_mutex& m, d1::rtm_mutex::scoped_lock& s, bool only_speculate) { + __TBB_ASSERT(s.m_transaction_state == d1::rtm_mutex::rtm_state::rtm_none, "scoped_lock already in transaction"); + if(governor::speculation_enabled()) { + int num_retries = 0; + transaction_result_type abort_code = 0; + do { + if(m.m_flag.load(std::memory_order_acquire)) { + if(only_speculate) return; + spin_wait_while_eq(m.m_flag, true); + } + // _xbegin returns -1 on success or the abort code, so capture it + if((abort_code = begin_transaction()) == transaction_result_type(speculation_successful_begin)) + { + // started speculation + if(m.m_flag.load(std::memory_order_relaxed)) { + abort_transaction(); + } + s.m_transaction_state = d1::rtm_mutex::rtm_state::rtm_transacting; + // Don not wrap the following assignment to a function, + // because it can abort the transaction in debug. Need mutex for release(). + s.m_mutex = &m; + return; // successfully started speculation + } + ++num_retries; + } while((abort_code & speculation_retry) != 0 && (num_retries < retry_threshold)); + } + + if(only_speculate) return; + s.m_mutex = &m; + s.m_mutex->lock(); + s.m_transaction_state = d1::rtm_mutex::rtm_state::rtm_real; + } + + //! Try to acquire lock on the given mutex. + static bool try_acquire(d1::rtm_mutex& m, d1::rtm_mutex::scoped_lock& s) { + acquire(m, s, /*only_speculate=*/true); + if (s.m_transaction_state == d1::rtm_mutex::rtm_state::rtm_transacting) { + return true; + } + __TBB_ASSERT(s.m_transaction_state == d1::rtm_mutex::rtm_state::rtm_none, nullptr); + // transacting acquire failed. 
try_lock the real mutex + if (m.try_lock()) { + s.m_mutex = &m; + s.m_transaction_state = d1::rtm_mutex::rtm_state::rtm_real; + return true; + } + return false; + } +}; + +void __TBB_EXPORTED_FUNC acquire(d1::rtm_mutex& m, d1::rtm_mutex::scoped_lock& s, bool only_speculate) { + rtm_mutex_impl::acquire(m, s, only_speculate); +} +bool __TBB_EXPORTED_FUNC try_acquire(d1::rtm_mutex& m, d1::rtm_mutex::scoped_lock& s) { + return rtm_mutex_impl::try_acquire(m, s); +} +void __TBB_EXPORTED_FUNC release(d1::rtm_mutex::scoped_lock& s) { + rtm_mutex_impl::release(s); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb + diff --git a/third_party/tbb/rtm_rw_mutex.cpp b/third_party/tbb/rtm_rw_mutex.cpp new file mode 100644 index 000000000..9e57652e6 --- /dev/null +++ b/third_party/tbb/rtm_rw_mutex.cpp @@ -0,0 +1,272 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_rtm_rw_mutex.h" +#include "third_party/tbb/itt_notify.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/misc.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace r1 { + +struct rtm_rw_mutex_impl { + // maximum number of times to retry + // TODO: experiment on retry values. + static constexpr int retry_threshold_read = 10; + static constexpr int retry_threshold_write = 10; + using transaction_result_type = decltype(begin_transaction()); + + //! Release speculative mutex + static void release(d1::rtm_rw_mutex::scoped_lock& s) { + switch(s.m_transaction_state) { + case d1::rtm_rw_mutex::rtm_type::rtm_transacting_writer: + case d1::rtm_rw_mutex::rtm_type::rtm_transacting_reader: + __TBB_ASSERT(is_in_transaction(), "m_transaction_state && not speculating"); + end_transaction(); + s.m_mutex = nullptr; + break; + case d1::rtm_rw_mutex::rtm_type::rtm_real_reader: + __TBB_ASSERT(!s.m_mutex->write_flag.load(std::memory_order_relaxed), "write_flag set but read lock acquired"); + s.m_mutex->unlock_shared(); + s.m_mutex = nullptr; + break; + case d1::rtm_rw_mutex::rtm_type::rtm_real_writer: + __TBB_ASSERT(s.m_mutex->write_flag.load(std::memory_order_relaxed), "write_flag unset but write lock acquired"); + s.m_mutex->write_flag.store(false, std::memory_order_relaxed); + s.m_mutex->unlock(); + s.m_mutex = nullptr; + break; + case d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex: + __TBB_ASSERT(false, "rtm_not_in_mutex, but in release"); + break; + default: + __TBB_ASSERT(false, "invalid m_transaction_state"); + } + s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex; + } + + //! Acquire write lock on the given mutex. 
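+    // only_speculate: when true, return without falling back to the real lock if a transaction
+    // cannot be started (this is how try_acquire_writer() probes for speculative acquisition).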
+ static void acquire_writer(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s, bool only_speculate) { + __TBB_ASSERT(s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex, "scoped_lock already in transaction"); + if(governor::speculation_enabled()) { + int num_retries = 0; + transaction_result_type abort_code = 0; + do { + if(m.m_state.load(std::memory_order_acquire)) { + if(only_speculate) return; + spin_wait_until_eq(m.m_state, d1::rtm_rw_mutex::state_type(0)); + } + // _xbegin returns -1 on success or the abort code, so capture it + if((abort_code = begin_transaction()) == transaction_result_type(speculation_successful_begin)) + { + // started speculation + if(m.m_state.load(std::memory_order_relaxed)) { // add spin_rw_mutex to read-set. + // reader or writer grabbed the lock, so abort. + abort_transaction(); + } + s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_transacting_writer; + // Don not wrap the following assignment to a function, + // because it can abort the transaction in debug. Need mutex for release(). + s.m_mutex = &m; + return; // successfully started speculation + } + ++num_retries; + } while((abort_code & speculation_retry) != 0 && (num_retries < retry_threshold_write)); + } + + if(only_speculate) return; + s.m_mutex = &m; // should apply a real try_lock... + s.m_mutex->lock(); // kill transactional writers + __TBB_ASSERT(!m.write_flag.load(std::memory_order_relaxed), "After acquire for write, write_flag already true"); + m.write_flag.store(true, std::memory_order_relaxed); // kill transactional readers + s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_writer; + } + + //! Acquire read lock on given mutex. + // only_speculate : true if we are doing a try_acquire. If true and we fail to speculate, don't + // really acquire the lock, return and do a try_acquire on the contained spin_rw_mutex. If + // the lock is already held by a writer, just return. + static void acquire_reader(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s, bool only_speculate) { + __TBB_ASSERT(s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex, "scoped_lock already in transaction"); + if(governor::speculation_enabled()) { + int num_retries = 0; + transaction_result_type abort_code = 0; + do { + // if in try_acquire, and lock is held as writer, don't attempt to speculate. + if(m.write_flag.load(std::memory_order_acquire)) { + if(only_speculate) return; + spin_wait_while_eq(m.write_flag, true); + } + // _xbegin returns -1 on success or the abort code, so capture it + if((abort_code = begin_transaction()) == transaction_result_type(speculation_successful_begin)) + { + // started speculation + if(m.write_flag.load(std::memory_order_relaxed)) { // add write_flag to read-set. + abort_transaction(); // writer grabbed the lock, so abort. + } + s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_transacting_reader; + // Don not wrap the following assignment to a function, + // because it can abort the transaction in debug. Need mutex for release(). + s.m_mutex = &m; + return; // successfully started speculation + } + // fallback path + // retry only if there is any hope of getting into a transaction soon + // Retry in the following cases (from Section 8.3.5 of + // Intel(R) Architecture Instruction Set Extensions Programming Reference): + // 1. abort caused by XABORT instruction (bit 0 of EAX register is set) + // 2. the transaction may succeed on a retry (bit 1 of EAX register is set) + // 3. 
if another logical processor conflicted with a memory address + // that was part of the transaction that aborted (bit 2 of EAX register is set) + // That is, retry if (abort_code & 0x7) is non-zero + ++num_retries; + } while((abort_code & speculation_retry) != 0 && (num_retries < retry_threshold_read)); + } + + if(only_speculate) return; + s.m_mutex = &m; + s.m_mutex->lock_shared(); + s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_reader; + } + + //! Upgrade reader to become a writer. + /** Returns whether the upgrade happened without releasing and re-acquiring the lock */ + static bool upgrade(d1::rtm_rw_mutex::scoped_lock& s) { + switch(s.m_transaction_state) { + case d1::rtm_rw_mutex::rtm_type::rtm_real_reader: { + s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_writer; + bool no_release = s.m_mutex->upgrade(); + __TBB_ASSERT(!s.m_mutex->write_flag.load(std::memory_order_relaxed), "After upgrade, write_flag already true"); + s.m_mutex->write_flag.store(true, std::memory_order_relaxed); + return no_release; + } + case d1::rtm_rw_mutex::rtm_type::rtm_transacting_reader: { + d1::rtm_rw_mutex& m = *s.m_mutex; + if(m.m_state.load(std::memory_order_acquire)) { // add spin_rw_mutex to read-set. + // Real reader or writer holds the lock; so commit the read and re-acquire for write. + release(s); + acquire_writer(m, s, false); + return false; + } else + { + s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_transacting_writer; + return true; + } + } + default: + __TBB_ASSERT(false, "Invalid state for upgrade"); + return false; + } + } + + //! Downgrade writer to a reader. + static bool downgrade(d1::rtm_rw_mutex::scoped_lock& s) { + switch (s.m_transaction_state) { + case d1::rtm_rw_mutex::rtm_type::rtm_real_writer: + s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_reader; + __TBB_ASSERT(s.m_mutex->write_flag.load(std::memory_order_relaxed), "Before downgrade write_flag not true"); + s.m_mutex->write_flag.store(false, std::memory_order_relaxed); + s.m_mutex->downgrade(); + return true; + case d1::rtm_rw_mutex::rtm_type::rtm_transacting_writer: + s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_transacting_reader; + return true; + default: + __TBB_ASSERT(false, "Invalid state for downgrade"); + return false; + } + } + + //! Try to acquire write lock on the given mutex. + // There may be reader(s) which acquired the spin_rw_mutex, as well as possibly + // transactional reader(s). If this is the case, the acquire will fail, and assigning + // write_flag will kill the transactors. So we only assign write_flag if we have successfully + // acquired the lock. + static bool try_acquire_writer(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s) { + acquire_writer(m, s, /*only_speculate=*/true); + if (s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_transacting_writer) { + return true; + } + __TBB_ASSERT(s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex, nullptr); + // transacting write acquire failed. try_lock the real mutex + if (m.try_lock()) { + s.m_mutex = &m; + // only shoot down readers if we're not transacting ourselves + __TBB_ASSERT(!m.write_flag.load(std::memory_order_relaxed), "After try_acquire_writer, write_flag already true"); + m.write_flag.store(true, std::memory_order_relaxed); + s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_writer; + return true; + } + return false; + } + + //! Try to acquire read lock on the given mutex. 
+ static bool try_acquire_reader(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s) { + // speculatively acquire the lock. If this fails, do try_lock_shared on the spin_rw_mutex. + acquire_reader(m, s, /*only_speculate=*/true); + if (s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_transacting_reader) { + return true; + } + __TBB_ASSERT(s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex, nullptr); + // transacting read acquire failed. try_lock_shared the real mutex + if (m.try_lock_shared()) { + s.m_mutex = &m; + s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_reader; + return true; + } + return false; + } +}; + +void __TBB_EXPORTED_FUNC acquire_writer(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s, bool only_speculate) { + rtm_rw_mutex_impl::acquire_writer(m, s, only_speculate); +} +//! Internal acquire read lock. +// only_speculate == true if we're doing a try_lock, else false. +void __TBB_EXPORTED_FUNC acquire_reader(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s, bool only_speculate) { + rtm_rw_mutex_impl::acquire_reader(m, s, only_speculate); +} +//! Internal upgrade reader to become a writer. +bool __TBB_EXPORTED_FUNC upgrade(d1::rtm_rw_mutex::scoped_lock& s) { + return rtm_rw_mutex_impl::upgrade(s); +} +//! Internal downgrade writer to become a reader. +bool __TBB_EXPORTED_FUNC downgrade(d1::rtm_rw_mutex::scoped_lock& s) { + return rtm_rw_mutex_impl::downgrade(s); +} +//! Internal try_acquire write lock. +bool __TBB_EXPORTED_FUNC try_acquire_writer(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s) { + return rtm_rw_mutex_impl::try_acquire_writer(m, s); +} +//! Internal try_acquire read lock. +bool __TBB_EXPORTED_FUNC try_acquire_reader(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s) { + return rtm_rw_mutex_impl::try_acquire_reader(m, s); +} +//! Internal release lock. +void __TBB_EXPORTED_FUNC release(d1::rtm_rw_mutex::scoped_lock& s) { + rtm_rw_mutex_impl::release(s); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb + + diff --git a/third_party/tbb/rw_mutex.h b/third_party/tbb/rw_mutex.h new file mode 100644 index 000000000..d156a0c60 --- /dev/null +++ b/third_party/tbb/rw_mutex.h @@ -0,0 +1,217 @@ +// clang-format off +/* + Copyright (c) 2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_rw_mutex_H +#define __TBB_rw_mutex_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_waitable_atomic.h" +#include "third_party/tbb/detail/_scoped_lock.h" +#include "third_party/tbb/detail/_mutex_common.h" +#include "third_party/tbb/profiling.h" + +namespace tbb { +namespace detail { +namespace d1 { + +class rw_mutex { +public: + //! Constructors + rw_mutex() noexcept : m_state(0) { + create_itt_sync(this, "tbb::rw_mutex", ""); + } + + //! Destructor + ~rw_mutex() { + __TBB_ASSERT(!m_state.load(std::memory_order_relaxed), "destruction of an acquired mutex"); + } + + //! 
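[Editor's note] The speculative acquire/try-acquire paths above all follow the same lock-elision shape: start a hardware transaction, touch the lock word so it joins the transaction's read-set, abort if a real owner is present, and fall back to the real lock after a bounded number of retries. The following stand-alone sketch shows that shape with the Intel RTM intrinsics from <immintrin.h>; it is a toy stand-in, not the TBB implementation, and assumes an RTM-capable CPU and compilation with -mrtm.

    // Minimal lock-elision sketch (illustrative only; assumes RTM support and -mrtm).
    #include <immintrin.h>
    #include <atomic>
    #include <mutex>

    std::mutex fallback_lock;
    std::atomic<bool> lock_held{false};   // mirror of the fallback lock's state

    template <typename F>
    void elided_critical_section(F&& body) {
        for (int retry = 0; retry < 3; ++retry) {
            unsigned status = _xbegin();
            if (status == _XBEGIN_STARTED) {
                if (lock_held.load(std::memory_order_relaxed))  // lock word joins the read-set
                    _xabort(0xff);                              // a real owner exists: abort
                body();
                _xend();                                        // commit the transaction
                return;
            }
            if (!(status & _XABORT_RETRY)) break;               // no point retrying
        }
        // Fallback: take the real lock; the store also aborts concurrent transactions.
        std::lock_guard<std::mutex> g(fallback_lock);
        lock_held.store(true, std::memory_order_relaxed);
        body();
        lock_held.store(false, std::memory_order_relaxed);
    }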
No Copy + rw_mutex(const rw_mutex&) = delete; + rw_mutex& operator=(const rw_mutex&) = delete; + + using scoped_lock = rw_scoped_lock; + + //! Mutex traits + static constexpr bool is_rw_mutex = true; + static constexpr bool is_recursive_mutex = false; + static constexpr bool is_fair_mutex = false; + + //! Acquire lock + void lock() { + call_itt_notify(prepare, this); + while (!try_lock()) { + if (!(m_state.load(std::memory_order_relaxed) & WRITER_PENDING)) { // no pending writers + m_state |= WRITER_PENDING; + } + + auto wakeup_condition = [&] { return !(m_state.load(std::memory_order_relaxed) & BUSY); }; + adaptive_wait_on_address(this, wakeup_condition, WRITER_CONTEXT); + } + + call_itt_notify(acquired, this); + } + + //! Try acquiring lock (non-blocking) + /** Return true if lock acquired; false otherwise. */ + bool try_lock() { + // for a writer: only possible to acquire if no active readers or writers + // Use relaxed memory fence is OK here because + // Acquire memory fence guaranteed by compare_exchange_strong() + state_type s = m_state.load(std::memory_order_relaxed); + if (!(s & BUSY)) { // no readers, no writers; mask is 1..1101 + if (m_state.compare_exchange_strong(s, WRITER)) { + call_itt_notify(acquired, this); + return true; // successfully stored writer flag + } + } + return false; + } + + //! Release lock + void unlock() { + call_itt_notify(releasing, this); + state_type curr_state = (m_state &= READERS | WRITER_PENDING); // Returns current state + + if (curr_state & WRITER_PENDING) { + r1::notify_by_address(this, WRITER_CONTEXT); + } else { + // It's possible that WRITER sleeps without WRITER_PENDING, + // because other thread might clear this bit at upgrade() + r1::notify_by_address_all(this); + } + } + + //! Lock shared ownership mutex + void lock_shared() { + call_itt_notify(prepare, this); + while (!try_lock_shared()) { + state_type has_writer = WRITER | WRITER_PENDING; + auto wakeup_condition = [&] { return !(m_state.load(std::memory_order_relaxed) & has_writer); }; + adaptive_wait_on_address(this, wakeup_condition, READER_CONTEXT); + } + __TBB_ASSERT(m_state.load(std::memory_order_relaxed) & READERS, "invalid state of a read lock: no readers"); + } + + //! Try lock shared ownership mutex + bool try_lock_shared() { + // for a reader: acquire if no active or waiting writers + // Use relaxed memory fence is OK here because + // Acquire memory fence guaranteed by fetch_add() + state_type has_writer = WRITER | WRITER_PENDING; + if (!(m_state.load(std::memory_order_relaxed) & has_writer)) { + if (m_state.fetch_add(ONE_READER) & has_writer) { + m_state -= ONE_READER; + r1::notify_by_address(this, WRITER_CONTEXT); + } else { + call_itt_notify(acquired, this); + return true; // successfully stored increased number of readers + } + } + return false; + } + + //! Unlock shared ownership mutex + void unlock_shared() { + __TBB_ASSERT(m_state.load(std::memory_order_relaxed) & READERS, "invalid state of a read lock: no readers"); + call_itt_notify(releasing, this); + + state_type curr_state = (m_state -= ONE_READER); // Returns current state + + if (curr_state & (WRITER_PENDING)) { + r1::notify_by_address(this, WRITER_CONTEXT); + } else { + // It's possible that WRITER sleeps without WRITER_PENDING, + // because other thread might clear this bit at upgrade() + r1::notify_by_address_all(this); + } + } + +private: + /** Internal non ISO C++ standard API **/ + //! This API is used through the scoped_lock class + + //! Upgrade reader to become a writer. 
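[Editor's note] The lock word above packs a writer bit, a writer-pending bit, and a reader count into a single integer: try_lock is one compare-exchange from "not busy" to WRITER, and try_lock_shared is an optimistic fetch_add of ONE_READER that is rolled back if a writer slipped in. A compact stand-alone model of just that arithmetic (same bit layout, no waiting or notification machinery):

    #include <atomic>
    #include <cstdint>

    // Same bit layout as rw_mutex::m_state above; only the try_* arithmetic is modeled.
    class tiny_rw_state {
        using state_t = std::intptr_t;
        static constexpr state_t WRITER = 1, WRITER_PENDING = 2, ONE_READER = 4;
        static constexpr state_t READERS = ~(WRITER | WRITER_PENDING);
        static constexpr state_t BUSY = WRITER | READERS;
        std::atomic<state_t> state{0};
    public:
        bool try_lock() {                       // writer: CAS only when nobody is inside
            state_t s = state.load(std::memory_order_relaxed);
            return !(s & BUSY) && state.compare_exchange_strong(s, s | WRITER);
        }
        void unlock() { state &= READERS | WRITER_PENDING; }   // drop the WRITER bit
        bool try_lock_shared() {                // reader: optimistic increment, undo on writer
            if (state.load(std::memory_order_relaxed) & (WRITER | WRITER_PENDING)) return false;
            if (state.fetch_add(ONE_READER) & (WRITER | WRITER_PENDING)) {
                state -= ONE_READER;            // a writer appeared: roll back
                return false;
            }
            return true;
        }
        void unlock_shared() { state -= ONE_READER; }
    };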
+ /** Returns whether the upgrade happened without releasing and re-acquiring the lock */ + bool upgrade() { + state_type s = m_state.load(std::memory_order_relaxed); + __TBB_ASSERT(s & READERS, "invalid state before upgrade: no readers "); + // Check and set writer-pending flag. + // Required conditions: either no pending writers, or we are the only reader + // (with multiple readers and pending writer, another upgrade could have been requested) + while ((s & READERS) == ONE_READER || !(s & WRITER_PENDING)) { + if (m_state.compare_exchange_strong(s, s | WRITER | WRITER_PENDING)) { + auto wakeup_condition = [&] { return (m_state.load(std::memory_order_relaxed) & READERS) == ONE_READER; }; + while ((m_state.load(std::memory_order_relaxed) & READERS) != ONE_READER) { + adaptive_wait_on_address(this, wakeup_condition, WRITER_CONTEXT); + } + + __TBB_ASSERT((m_state.load(std::memory_order_relaxed) & (WRITER_PENDING|WRITER)) == (WRITER_PENDING | WRITER), + "invalid state when upgrading to writer"); + // Both new readers and writers are blocked at this time + m_state -= (ONE_READER + WRITER_PENDING); + return true; // successfully upgraded + } + } + // Slow reacquire + unlock_shared(); + lock(); + return false; + } + + //! Downgrade writer to a reader + void downgrade() { + __TBB_ASSERT(m_state.load(std::memory_order_relaxed) & WRITER, nullptr), + call_itt_notify(releasing, this); + m_state += (ONE_READER - WRITER); + + if (!(m_state & WRITER_PENDING)) { + r1::notify_by_address(this, READER_CONTEXT); + } + + __TBB_ASSERT(m_state.load(std::memory_order_relaxed) & READERS, "invalid state after downgrade: no readers"); + } + + using state_type = std::intptr_t; + static constexpr state_type WRITER = 1; + static constexpr state_type WRITER_PENDING = 2; + static constexpr state_type READERS = ~(WRITER | WRITER_PENDING); + static constexpr state_type ONE_READER = 4; + static constexpr state_type BUSY = WRITER | READERS; + + using context_type = std::uintptr_t; + static constexpr context_type WRITER_CONTEXT = 0; + static constexpr context_type READER_CONTEXT = 1; + friend scoped_lock; + //! State of lock + /** Bit 0 = writer is holding lock + Bit 1 = request by a writer to acquire lock (hint to readers to wait) + Bit 2..N = number of readers holding lock */ + std::atomic m_state; +}; // class rw_mutex + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::rw_mutex; +} // namespace v1 + +} // namespace tbb + +#endif // __TBB_rw_mutex_H diff --git a/third_party/tbb/scalable_allocator.h b/third_party/tbb/scalable_allocator.h new file mode 100644 index 000000000..d6f6b9c60 --- /dev/null +++ b/third_party/tbb/scalable_allocator.h @@ -0,0 +1,338 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
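[Editor's note] The upgrade()/downgrade() protocol above surfaces at the public API through the scoped-lock interfaces. A usage sketch with tbb::spin_rw_mutex, whose scoped_lock has long offered upgrade_to_writer(); the cache, key type, and compute() helper are made up for illustration:

    #include <tbb/spin_rw_mutex.h>
    #include <map>

    int compute(int k) { return k * k; }   // stand-in for the real work

    tbb::spin_rw_mutex cache_mutex;        // hypothetical shared cache and its lock
    std::map<int, int> cache;

    int lookup_or_insert(int key) {
        tbb::spin_rw_mutex::scoped_lock lock(cache_mutex, /*write=*/false);  // start as reader
        auto it = cache.find(key);
        if (it != cache.end()) return it->second;
        // Not found: try to become a writer. If upgrade_to_writer() returns false,
        // the lock was released and re-acquired, so another thread may have inserted.
        if (!lock.upgrade_to_writer()) {
            it = cache.find(key);
            if (it != cache.end()) return it->second;
        }
        return cache.emplace(key, compute(key)).first->second;
    }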
+*/ + +#ifndef __TBB_scalable_allocator_H +#define __TBB_scalable_allocator_H + +#ifdef __cplusplus +// MISSING #include "oneapi/tbb/detail/_config.h" +#include "third_party/tbb/detail/_utils.h" +// MISSING #include "oneapi/tbb/detail/_namespace_injection.h" +#include "third_party/libcxx/cstdlib" +#include "third_party/libcxx/utility" +#include "third_party/libcxx/new" /* std::bad_alloc() */ +#else +// MISSING #include "oneapi/tbb/detail/_export.h" + /* Need ptrdiff_t and size_t from here. */ +#if !defined(_MSC_VER) || defined(__clang__) +#include "libc/inttypes.h" +#include "libc/limits.h" +#include "libc/literal.h" /* Need intptr_t from here. */ +#endif +#endif + +#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT +// MISSING #include +#endif + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#if _MSC_VER + #define __TBB_EXPORTED_FUNC __cdecl +#else + #define __TBB_EXPORTED_FUNC +#endif + +/** The "malloc" analogue to allocate block of memory of size bytes. + * @ingroup memory_allocation */ +TBBMALLOC_EXPORT void* __TBB_EXPORTED_FUNC scalable_malloc(size_t size); + +/** The "free" analogue to discard a previously allocated piece of memory. + @ingroup memory_allocation */ +TBBMALLOC_EXPORT void __TBB_EXPORTED_FUNC scalable_free(void* ptr); + +/** The "realloc" analogue complementing scalable_malloc. + @ingroup memory_allocation */ +TBBMALLOC_EXPORT void* __TBB_EXPORTED_FUNC scalable_realloc(void* ptr, size_t size); + +/** The "calloc" analogue complementing scalable_malloc. + @ingroup memory_allocation */ +TBBMALLOC_EXPORT void* __TBB_EXPORTED_FUNC scalable_calloc(size_t nobj, size_t size); + +/** The "posix_memalign" analogue. + @ingroup memory_allocation */ +TBBMALLOC_EXPORT int __TBB_EXPORTED_FUNC scalable_posix_memalign(void** memptr, size_t alignment, size_t size); + +/** The "_aligned_malloc" analogue. + @ingroup memory_allocation */ +TBBMALLOC_EXPORT void* __TBB_EXPORTED_FUNC scalable_aligned_malloc(size_t size, size_t alignment); + +/** The "_aligned_realloc" analogue. + @ingroup memory_allocation */ +TBBMALLOC_EXPORT void* __TBB_EXPORTED_FUNC scalable_aligned_realloc(void* ptr, size_t size, size_t alignment); + +/** The "_aligned_free" analogue. + @ingroup memory_allocation */ +TBBMALLOC_EXPORT void __TBB_EXPORTED_FUNC scalable_aligned_free(void* ptr); + +/** The analogue of _msize/malloc_size/malloc_usable_size. + Returns the usable size of a memory block previously allocated by scalable_*, + or 0 (zero) if ptr does not point to such a block. + @ingroup memory_allocation */ +TBBMALLOC_EXPORT size_t __TBB_EXPORTED_FUNC scalable_msize(void* ptr); + +/* Results for scalable_allocation_* functions */ +typedef enum { + TBBMALLOC_OK, + TBBMALLOC_INVALID_PARAM, + TBBMALLOC_UNSUPPORTED, + TBBMALLOC_NO_MEMORY, + TBBMALLOC_NO_EFFECT +} ScalableAllocationResult; + +/* Setting TBB_MALLOC_USE_HUGE_PAGES environment variable to 1 enables huge pages. + scalable_allocation_mode call has priority over environment variable. */ +typedef enum { + TBBMALLOC_USE_HUGE_PAGES, /* value turns using huge pages on and off */ + /* deprecated, kept for backward compatibility only */ + USE_HUGE_PAGES = TBBMALLOC_USE_HUGE_PAGES, + /* try to limit memory consumption value (Bytes), clean internal buffers + if limit is exceeded, but not prevents from requesting memory from OS */ + TBBMALLOC_SET_SOFT_HEAP_LIMIT, + /* Lower bound for the size (Bytes), that is interpreted as huge + * and not released during regular cleanup operations. 
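[Editor's note] A quick sketch of how the C-level entry points declared above are typically used; it assumes the program links against the TBB scalable allocator library that provides these exported symbols.

    #include <tbb/scalable_allocator.h>
    #include <cstdio>

    int main() {
        void* p = scalable_malloc(1024);                          // "malloc" analogue
        if (!p) return 1;
        std::printf("usable size: %zu\n", scalable_msize(p));     // at least 1024
        p = scalable_realloc(p, 4096);                            // grow the block (may return null)
        scalable_free(p);                                         // "free" analogue

        void* aligned = scalable_aligned_malloc(256, 64);         // 64-byte aligned block
        scalable_aligned_free(aligned);
        return 0;
    }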
*/ + TBBMALLOC_SET_HUGE_SIZE_THRESHOLD +} AllocationModeParam; + +/** Set TBB allocator-specific allocation modes. + @ingroup memory_allocation */ +TBBMALLOC_EXPORT int __TBB_EXPORTED_FUNC scalable_allocation_mode(int param, intptr_t value); + +typedef enum { + /* Clean internal allocator buffers for all threads. + Returns TBBMALLOC_NO_EFFECT if no buffers cleaned, + TBBMALLOC_OK if some memory released from buffers. */ + TBBMALLOC_CLEAN_ALL_BUFFERS, + /* Clean internal allocator buffer for current thread only. + Return values same as for TBBMALLOC_CLEAN_ALL_BUFFERS. */ + TBBMALLOC_CLEAN_THREAD_BUFFERS +} ScalableAllocationCmd; + +/** Call TBB allocator-specific commands. + @ingroup memory_allocation */ +TBBMALLOC_EXPORT int __TBB_EXPORTED_FUNC scalable_allocation_command(int cmd, void *param); + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#ifdef __cplusplus + +//! The namespace rml contains components of low-level memory pool interface. +namespace rml { +class MemoryPool; + +typedef void *(*rawAllocType)(std::intptr_t pool_id, std::size_t &bytes); +// returns non-zero in case of error +typedef int (*rawFreeType)(std::intptr_t pool_id, void* raw_ptr, std::size_t raw_bytes); + +struct MemPoolPolicy { + enum { + TBBMALLOC_POOL_VERSION = 1 + }; + + rawAllocType pAlloc; + rawFreeType pFree; + // granularity of pAlloc allocations. 0 means default used. + std::size_t granularity; + int version; + // all memory consumed at 1st pAlloc call and never returned, + // no more pAlloc calls after 1st + unsigned fixedPool : 1, + // memory consumed but returned only at pool termination + keepAllMemory : 1, + reserved : 30; + + MemPoolPolicy(rawAllocType pAlloc_, rawFreeType pFree_, + std::size_t granularity_ = 0, bool fixedPool_ = false, + bool keepAllMemory_ = false) : + pAlloc(pAlloc_), pFree(pFree_), granularity(granularity_), version(TBBMALLOC_POOL_VERSION), + fixedPool(fixedPool_), keepAllMemory(keepAllMemory_), + reserved(0) {} +}; + +// enums have same values as appropriate enums from ScalableAllocationResult +// TODO: use ScalableAllocationResult in pool_create directly +enum MemPoolError { + // pool created successfully + POOL_OK = TBBMALLOC_OK, + // invalid policy parameters found + INVALID_POLICY = TBBMALLOC_INVALID_PARAM, + // requested pool policy is not supported by allocator library + UNSUPPORTED_POLICY = TBBMALLOC_UNSUPPORTED, + // lack of memory during pool creation + NO_MEMORY = TBBMALLOC_NO_MEMORY, + // action takes no effect + NO_EFFECT = TBBMALLOC_NO_EFFECT +}; + +TBBMALLOC_EXPORT MemPoolError pool_create_v1(std::intptr_t pool_id, const MemPoolPolicy *policy, + rml::MemoryPool **pool); + +TBBMALLOC_EXPORT bool pool_destroy(MemoryPool* memPool); +TBBMALLOC_EXPORT void *pool_malloc(MemoryPool* memPool, std::size_t size); +TBBMALLOC_EXPORT void *pool_realloc(MemoryPool* memPool, void *object, std::size_t size); +TBBMALLOC_EXPORT void *pool_aligned_malloc(MemoryPool* mPool, std::size_t size, std::size_t alignment); +TBBMALLOC_EXPORT void *pool_aligned_realloc(MemoryPool* mPool, void *ptr, std::size_t size, std::size_t alignment); +TBBMALLOC_EXPORT bool pool_reset(MemoryPool* memPool); +TBBMALLOC_EXPORT bool pool_free(MemoryPool *memPool, void *object); +TBBMALLOC_EXPORT MemoryPool *pool_identify(void *object); +TBBMALLOC_EXPORT std::size_t pool_msize(MemoryPool *memPool, void *object); + +} // namespace rml + +namespace tbb { +namespace detail { +namespace d1 { + +// keep throw in a separate function to prevent code bloat +template +void throw_exception(const E 
&e) { +#if TBB_USE_EXCEPTIONS + throw e; +#else + suppress_unused_warning(e); +#endif +} + +template +class scalable_allocator { +public: + using value_type = T; + using propagate_on_container_move_assignment = std::true_type; + + //! Always defined for TBB containers + using is_always_equal = std::true_type; + + scalable_allocator() = default; + template scalable_allocator(const scalable_allocator&) noexcept {} + + //! Allocate space for n objects. + __TBB_nodiscard T* allocate(std::size_t n) { + T* p = static_cast(scalable_malloc(n * sizeof(value_type))); + if (!p) { + throw_exception(std::bad_alloc()); + } + return p; + } + + //! Free previously allocated block of memory + void deallocate(T* p, std::size_t) { + scalable_free(p); + } + +#if TBB_ALLOCATOR_TRAITS_BROKEN + using pointer = value_type*; + using const_pointer = const value_type*; + using reference = value_type&; + using const_reference = const value_type&; + using difference_type = std::ptrdiff_t; + using size_type = std::size_t; + template struct rebind { + using other = scalable_allocator; + }; + //! Largest value for which method allocate might succeed. + size_type max_size() const noexcept { + size_type absolutemax = static_cast(-1) / sizeof (value_type); + return (absolutemax > 0 ? absolutemax : 1); + } + template + void construct(U *p, Args&&... args) + { ::new((void *)p) U(std::forward(args)...); } + void destroy(pointer p) { p->~value_type(); } + pointer address(reference x) const { return &x; } + const_pointer address(const_reference x) const { return &x; } +#endif // TBB_ALLOCATOR_TRAITS_BROKEN + +}; + +#if TBB_ALLOCATOR_TRAITS_BROKEN + template<> + class scalable_allocator { + public: + using pointer = void*; + using const_pointer = const void*; + using value_type = void; + template struct rebind { + using other = scalable_allocator; + }; + }; +#endif + +template +inline bool operator==(const scalable_allocator&, const scalable_allocator&) noexcept { return true; } + +#if !__TBB_CPP20_COMPARISONS_PRESENT +template +inline bool operator!=(const scalable_allocator&, const scalable_allocator&) noexcept { return false; } +#endif + +#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT + +//! C++17 memory resource implementation for scalable allocator +//! ISO C++ Section 23.12.2 +class scalable_resource_impl : public std::pmr::memory_resource { +private: + void* do_allocate(std::size_t bytes, std::size_t alignment) override { + void* p = scalable_aligned_malloc(bytes, alignment); + if (!p) { + throw_exception(std::bad_alloc()); + } + return p; + } + + void do_deallocate(void* ptr, std::size_t /*bytes*/, std::size_t /*alignment*/) override { + scalable_free(ptr); + } + + //! Memory allocated by one instance of scalable_resource_impl could be deallocated by any + //! other instance of this class + bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override { + return this == &other || +#if __TBB_USE_OPTIONAL_RTTI + dynamic_cast(&other) != nullptr; +#else + false; +#endif + } +}; + +//! 
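[Editor's note] The allocator template above plugs straight into standard containers; because allocations come from per-thread pools, concurrent pushes from different threads avoid contending on one global heap lock. A minimal usage sketch (rebinding for node types is handled by std::allocator_traits):

    #include <tbb/scalable_allocator.h>
    #include <vector>
    #include <map>

    // Containers whose buffers and nodes come from the scalable allocator.
    std::vector<int, tbb::scalable_allocator<int>> v;
    std::map<int, double, std::less<int>,
             tbb::scalable_allocator<std::pair<const int, double>>> m;

    void fill() {
        for (int i = 0; i < 1000; ++i) { v.push_back(i); m[i] = i * 0.5; }
    }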
Global scalable allocator memory resource provider +inline std::pmr::memory_resource* scalable_memory_resource() noexcept { + static tbb::detail::d1::scalable_resource_impl scalable_res; + return &scalable_res; +} + +#endif // __TBB_CPP17_MEMORY_RESOURCE_PRESENT + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::scalable_allocator; +#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT +using detail::d1::scalable_memory_resource; +#endif +} // namespace v1 + +} // namespace tbb + +#endif /* __cplusplus */ + +#endif /* __TBB_scalable_allocator_H */ diff --git a/third_party/tbb/scheduler_common.h b/third_party/tbb/scheduler_common.h new file mode 100644 index 000000000..8a0496bd6 --- /dev/null +++ b/third_party/tbb/scheduler_common.h @@ -0,0 +1,599 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_scheduler_common_H +#define _TBB_scheduler_common_H + +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/detail/_machine.h" +#include "third_party/tbb/task_group.h" +#include "third_party/tbb/cache_aligned_allocator.h" +#include "third_party/tbb/itt_notify.h" +#include "third_party/tbb/co_context.h" +#include "third_party/tbb/misc.h" +#include "third_party/tbb/governor.h" + +#ifndef __TBB_SCHEDULER_MUTEX_TYPE +#define __TBB_SCHEDULER_MUTEX_TYPE tbb::spin_mutex +#endif +// TODO: add conditional inclusion based on specified type +#include "third_party/tbb/spin_mutex.h" +#include "third_party/tbb/mutex.h" + +#if TBB_USE_ASSERT +#include "third_party/libcxx/atomic" +#endif + +#include "third_party/libcxx/cstdint" +#include "third_party/libcxx/exception" +#include "third_party/libcxx/memory" // unique_ptr + +//! Mutex type for global locks in the scheduler +using scheduler_mutex_type = __TBB_SCHEDULER_MUTEX_TYPE; + +#if _MSC_VER && !defined(__INTEL_COMPILER) + // Workaround for overzealous compiler warnings + // These particular warnings are so ubiquitous that no attempt is made to narrow + // the scope of the warnings. 
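[Editor's note] When C++17 polymorphic allocators are available (i.e. the build sees __TBB_CPP17_MEMORY_RESOURCE_PRESENT and the standard library ships <memory_resource>), scalable_memory_resource() lets std::pmr containers draw from the same allocator without changing their types. A sketch under those assumptions:

    #include <tbb/scalable_allocator.h>
    #include <memory_resource>
    #include <vector>
    #include <string>

    void pmr_demo() {
        std::pmr::memory_resource* res = tbb::scalable_memory_resource();
        std::pmr::vector<std::pmr::string> names{res};   // allocations go through tbbmalloc
        names.emplace_back("hello");
        names.emplace_back("world");
    }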
+ #pragma warning (disable: 4100 4127 4312 4244 4267 4706) +#endif + +namespace tbb { +namespace detail { +namespace r1 { + +class arena; +class mail_inbox; +class mail_outbox; +class market; +class observer_proxy; + +enum task_stream_accessor_type { front_accessor = 0, back_nonnull_accessor }; +template class task_stream; + +using isolation_type = std::intptr_t; +constexpr isolation_type no_isolation = 0; + +struct cache_aligned_deleter { + template + void operator() (T* ptr) const { + ptr->~T(); + cache_aligned_deallocate(ptr); + } +}; + +template +using cache_aligned_unique_ptr = std::unique_ptr; + +template +cache_aligned_unique_ptr make_cache_aligned_unique(Args&& ...args) { + return cache_aligned_unique_ptr(new (cache_aligned_allocate(sizeof(T))) T(std::forward(args)...)); +} + +//------------------------------------------------------------------------ +// Extended execute data +//------------------------------------------------------------------------ + +//! Execute data used on a task dispatcher side, reflects a current execution state +struct execution_data_ext : d1::execution_data { + task_dispatcher* task_disp{}; + isolation_type isolation{}; + d1::wait_context* wait_ctx{}; +}; + +//------------------------------------------------------------------------ +// Task accessor +//------------------------------------------------------------------------ + +//! Interpretation of reserved task fields inside a task dispatcher +struct task_accessor { + static constexpr std::uint64_t proxy_task_trait = 1; + static constexpr std::uint64_t resume_task_trait = 2; + static d1::task_group_context*& context(d1::task& t) { + task_group_context** tgc = reinterpret_cast(&t.m_reserved[0]); + return *tgc; + } + static isolation_type& isolation(d1::task& t) { + isolation_type* tag = reinterpret_cast(&t.m_reserved[2]); + return *tag; + } + static void set_proxy_trait(d1::task& t) { + // TODO: refactor proxy tasks not to work on uninitialized memory. + //__TBB_ASSERT((t.m_version_and_traits & proxy_task_trait) == 0, nullptr); + t.m_version_and_traits |= proxy_task_trait; + } + static bool is_proxy_task(d1::task& t) { + return (t.m_version_and_traits & proxy_task_trait) != 0; + } + static void set_resume_trait(d1::task& t) { + __TBB_ASSERT((t.m_version_and_traits & resume_task_trait) == 0, nullptr); + t.m_version_and_traits |= resume_task_trait; + } + static bool is_resume_task(d1::task& t) { + return (t.m_version_and_traits & resume_task_trait) != 0; + } +}; + +//------------------------------------------------------------------------ +//! Extended variant of the standard offsetof macro +/** The standard offsetof macro is not sufficient for TBB as it can be used for + POD-types only. The constant 0x1000 (not nullptr) is necessary to appease GCC. **/ +#define __TBB_offsetof(class_name, member_name) \ + ((ptrdiff_t)&(reinterpret_cast(0x1000)->member_name) - 0x1000) + +//! Returns address of the object containing a member with the given name and address +#define __TBB_get_object_ref(class_name, member_name, member_addr) \ + (*reinterpret_cast((char*)member_addr - __TBB_offsetof(class_name, member_name))) + +//! 
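[Editor's note] make_cache_aligned_unique above combines three standard pieces: an aligned raw allocation, placement-new construction, and a unique_ptr whose deleter runs the destructor and returns the raw block. A generic stand-alone version of the same pattern, using over-aligned operator new in place of cache_aligned_allocate and assuming a 64-byte cache line (error handling on a throwing constructor is elided):

    #include <memory>
    #include <new>
    #include <utility>
    #include <cstddef>

    constexpr std::size_t cache_line = 64;   // assumed cache-line size for this sketch

    struct aligned_deleter {
        template <typename T>
        void operator()(T* p) const {
            p->~T();                                             // run the destructor
            ::operator delete(p, std::align_val_t(cache_line));  // release the raw block
        }
    };

    template <typename T, typename... Args>
    std::unique_ptr<T, aligned_deleter> make_aligned_unique(Args&&... args) {
        void* raw = ::operator new(sizeof(T), std::align_val_t(cache_line));
        return std::unique_ptr<T, aligned_deleter>(new (raw) T(std::forward<Args>(args)...));
    }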
Helper class for tracking floating point context and task group context switches +/** Assuming presence of an itt collector, in addition to keeping track of floating + point context, this class emits itt events to indicate begin and end of task group + context execution **/ +template +class context_guard_helper { + const d1::task_group_context* curr_ctx; + d1::cpu_ctl_env guard_cpu_ctl_env; + d1::cpu_ctl_env curr_cpu_ctl_env; +public: + context_guard_helper() : curr_ctx(nullptr) { + guard_cpu_ctl_env.get_env(); + curr_cpu_ctl_env = guard_cpu_ctl_env; + } + ~context_guard_helper() { + if (curr_cpu_ctl_env != guard_cpu_ctl_env) + guard_cpu_ctl_env.set_env(); + if (report_tasks && curr_ctx) + ITT_TASK_END; + } + // The function is called from bypass dispatch loop on the hot path. + // Consider performance issues when refactoring. + void set_ctx(const d1::task_group_context* ctx) { + if (!ctx) + return; + const d1::cpu_ctl_env* ctl = reinterpret_cast(&ctx->my_cpu_ctl_env); + // Compare the FPU settings directly because the context can be reused between parallel algorithms. + if (*ctl != curr_cpu_ctl_env) { + curr_cpu_ctl_env = *ctl; + curr_cpu_ctl_env.set_env(); + } + if (report_tasks && ctx != curr_ctx) { + // if task group context was active, report end of current execution frame. + if (curr_ctx) + ITT_TASK_END; + // reporting begin of new task group context execution frame. + // using address of task group context object to group tasks (parent). + // id of task execution frame is nullptr and reserved for future use. + ITT_TASK_BEGIN(ctx, ctx->my_name, nullptr); + curr_ctx = ctx; + } + } +#if _WIN64 + void restore_default() { + if (curr_cpu_ctl_env != guard_cpu_ctl_env) { + guard_cpu_ctl_env.set_env(); + curr_cpu_ctl_env = guard_cpu_ctl_env; + } + } +#endif // _WIN64 +}; + +#if (_WIN32 || _WIN64 || __unix__ || __APPLE__) && (__TBB_x86_32 || __TBB_x86_64) +#if _MSC_VER +#pragma intrinsic(__rdtsc) +#endif +inline std::uint64_t machine_time_stamp() { +#if __INTEL_COMPILER + return _rdtsc(); +#elif _MSC_VER + return __rdtsc(); +#else + std::uint32_t hi, lo; + __asm__ __volatile__("rdtsc" : "=d"(hi), "=a"(lo)); + return (std::uint64_t(hi) << 32) | lo; +#endif +} + +inline void prolonged_pause_impl() { + // Assumption based on practice: 1000-2000 ticks seems to be a suitable invariant for the + // majority of platforms. Currently, skip platforms that define __TBB_STEALING_PAUSE + // because these platforms require very careful tuning. + std::uint64_t prev = machine_time_stamp(); + const std::uint64_t finish = prev + 1000; + atomic_backoff backoff; + do { + backoff.bounded_pause(); + std::uint64_t curr = machine_time_stamp(); + if (curr <= prev) + // Possibly, the current logical thread is moved to another hardware thread or overflow is occurred. + break; + prev = curr; + } while (prev < finish); +} +#else +inline void prolonged_pause_impl() { +#ifdef __TBB_ipf + static const long PauseTime = 1500; +#else + static const long PauseTime = 80; +#endif + // TODO IDEA: Update PauseTime adaptively? + machine_pause(PauseTime); +} +#endif + +inline void prolonged_pause() { +#if __TBB_WAITPKG_INTRINSICS_PRESENT + if (governor::wait_package_enabled()) { + std::uint64_t time_stamp = machine_time_stamp(); + // _tpause function directs the processor to enter an implementation-dependent optimized state + // until the Time Stamp Counter reaches or exceeds the value specified in second parameter. + // Constant "700" is ticks to wait for. 
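[Editor's note] prolonged_pause_impl above bounds the spin by the time-stamp counter rather than by an iteration count, so the wait length is roughly constant in wall-clock terms across CPU generations. A stripped-down stand-alone version of the same idea, for GCC/Clang on x86 only, reusing the rdtsc inline assembly shown above and _mm_pause to relax the core between probes:

    #include <immintrin.h>   // _mm_pause
    #include <cstdint>

    inline std::uint64_t read_tsc() {
        std::uint32_t hi, lo;
        __asm__ __volatile__("rdtsc" : "=d"(hi), "=a"(lo));
        return (std::uint64_t(hi) << 32) | lo;
    }

    // Spin for roughly `ticks` TSC increments, pausing the core between probes.
    inline void pause_for_ticks(std::uint64_t ticks) {
        std::uint64_t prev = read_tsc();
        const std::uint64_t finish = prev + ticks;
        for (;;) {
            _mm_pause();
            std::uint64_t curr = read_tsc();
            if (curr <= prev || curr >= finish) break;  // done, migrated, or counter wrapped
            prev = curr;
        }
    }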
+ // First parameter 0 selects between a lower power (cleared) or faster wakeup (set) optimized state. + _tpause(0, time_stamp + 700); + } + else +#endif + prolonged_pause_impl(); +} + +// TODO: investigate possibility to work with number of CPU cycles +// because for different configurations this number of pauses + yields +// will be calculated in different amount of CPU cycles +// for example use rdtsc for it +class stealing_loop_backoff { + const int my_pause_threshold; + const int my_yield_threshold; + int my_pause_count; + int my_yield_count; +public: + // my_yield_threshold = 100 is an experimental value. Ideally, once we start calling __TBB_Yield(), + // the time spent spinning before calling out_of_work() should be approximately + // the time it takes for a thread to be woken up. Doing so would guarantee that we do + // no worse than 2x the optimal spin time. Or perhaps a time-slice quantum is the right amount. + stealing_loop_backoff(int num_workers, int yields_multiplier) + : my_pause_threshold{ 2 * (num_workers + 1) } +#if __APPLE__ + // threshold value tuned separately for macOS due to high cost of sched_yield there + , my_yield_threshold{10 * yields_multiplier} +#else + , my_yield_threshold{100 * yields_multiplier} +#endif + , my_pause_count{} + , my_yield_count{} + {} + bool pause() { + prolonged_pause(); + if (my_pause_count++ >= my_pause_threshold) { + my_pause_count = my_pause_threshold; + d0::yield(); + if (my_yield_count++ >= my_yield_threshold) { + my_yield_count = my_yield_threshold; + return true; + } + } + return false; + } + void reset_wait() { + my_pause_count = my_yield_count = 0; + } +}; + +//------------------------------------------------------------------------ +// Exception support +//------------------------------------------------------------------------ +//! Task group state change propagation global epoch +/** Together with generic_scheduler::my_context_state_propagation_epoch forms + cross-thread signaling mechanism that allows to avoid locking at the hot path + of normal execution flow. + + When a descendant task group context is registered or unregistered, the global + and local epochs are compared. If they differ, a state change is being propagated, + and thus registration/deregistration routines take slower branch that may block + (at most one thread of the pool can be blocked at any moment). Otherwise the + control path is lock-free and fast. **/ +extern std::atomic the_context_state_propagation_epoch; + +//! Mutex guarding state change propagation across task groups forest. +/** Also protects modification of related data structures. **/ +typedef scheduler_mutex_type context_state_propagation_mutex_type; +extern context_state_propagation_mutex_type the_context_state_propagation_mutex; + +class tbb_exception_ptr { + std::exception_ptr my_ptr; +public: + static tbb_exception_ptr* allocate() noexcept; + + //! Destroys this objects + /** Note that objects of this type can be created only by the allocate() method. **/ + void destroy() noexcept; + + //! Throws the contained exception . 
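[Editor's note] tbb_exception_ptr below is a thin wrapper over std::exception_ptr: the scheduler captures an exception on the thread where a task threw and rethrows it on the thread that waits. The underlying standard mechanism, shown in isolation:

    #include <exception>
    #include <stdexcept>
    #include <thread>
    #include <cstdio>

    int main() {
        std::exception_ptr captured;
        std::thread worker([&] {
            try {
                throw std::runtime_error("failed inside the worker");
            } catch (...) {
                captured = std::current_exception();   // capture on the throwing thread
            }
        });
        worker.join();
        try {
            if (captured) std::rethrow_exception(captured);  // rethrow on the waiting thread
        } catch (const std::exception& e) {
            std::puts(e.what());
        }
    }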
+ void throw_self(); + +private: + tbb_exception_ptr(const std::exception_ptr& src) : my_ptr(src) {} +}; // class tbb_exception_ptr + +//------------------------------------------------------------------------ +// Debugging support +//------------------------------------------------------------------------ + +#if TBB_USE_ASSERT +static const std::uintptr_t venom = tbb::detail::select_size_t_constant<0xDEADBEEFU, 0xDDEEAADDDEADBEEFULL>::value; + +inline void poison_value(std::uintptr_t& val) { val = venom; } + +inline void poison_value(std::atomic& val) { val.store(venom, std::memory_order_relaxed); } + +/** Expected to be used in assertions only, thus no empty form is defined. **/ +inline bool is_alive(std::uintptr_t v) { return v != venom; } + +/** Logically, this method should be a member of class task. + But we do not want to publish it, so it is here instead. */ +inline void assert_task_valid(const d1::task* t) { + assert_pointer_valid(t); +} +#else /* !TBB_USE_ASSERT */ + +/** In contrast to debug version poison_value() is a macro here because + the variable used as its argument may be undefined in release builds. **/ +#define poison_value(g) ((void)0) + +inline void assert_task_valid(const d1::task*) {} + +#endif /* !TBB_USE_ASSERT */ + +struct suspend_point_type { +#if __TBB_RESUMABLE_TASKS + //! The arena related to this task_dispatcher + arena* m_arena{ nullptr }; + //! The random for the resume task + FastRandom m_random; + //! The flag is raised when the original owner should return to this task dispatcher. + std::atomic m_is_owner_recalled{ false }; + //! Inicates if the resume task should be placed to the critical task stream. + bool m_is_critical{ false }; + //! Associated coroutine + co_context m_co_context; + //! Supend point before resume + suspend_point_type* m_prev_suspend_point{nullptr}; + + // Possible state transitions: + // A -> S -> N -> A + // A -> N -> S -> N -> A + enum class stack_state { + active, // some thread is working with this stack + suspended, // no thread is working with this stack + notified // some thread tried to resume this stack + }; + + //! The flag required to protect suspend finish and resume call + std::atomic m_stack_state{stack_state::active}; + + void resume(suspend_point_type* sp) { + __TBB_ASSERT(m_stack_state.load(std::memory_order_relaxed) != stack_state::suspended, "The stack is expected to be active"); + + sp->m_prev_suspend_point = this; + + // Do not access sp after resume + m_co_context.resume(sp->m_co_context); + __TBB_ASSERT(m_stack_state.load(std::memory_order_relaxed) != stack_state::active, nullptr); + + finilize_resume(); + } + + void finilize_resume() { + m_stack_state.store(stack_state::active, std::memory_order_relaxed); + // Set the suspended state for the stack that we left. If the state is already notified, it means that + // someone already tried to resume our previous stack but failed. So, we need to resume it. + // m_prev_suspend_point might be nullptr when destroying co_context based on threads + if (m_prev_suspend_point && m_prev_suspend_point->m_stack_state.exchange(stack_state::suspended) == stack_state::notified) { + r1::resume(m_prev_suspend_point); + } + m_prev_suspend_point = nullptr; + } + + bool try_notify_resume() { + // Check that stack is already suspended. Return false if not yet. 
+ return m_stack_state.exchange(stack_state::notified) == stack_state::suspended; + } + + void recall_owner() { + __TBB_ASSERT(m_stack_state.load(std::memory_order_relaxed) == stack_state::suspended, nullptr); + m_stack_state.store(stack_state::notified, std::memory_order_relaxed); + m_is_owner_recalled.store(true, std::memory_order_release); + } + + struct resume_task final : public d1::task { + task_dispatcher& m_target; + explicit resume_task(task_dispatcher& target) : m_target(target) { + task_accessor::set_resume_trait(*this); + } + d1::task* execute(d1::execution_data& ed) override; + d1::task* cancel(d1::execution_data&) override { + __TBB_ASSERT(false, "The resume task cannot be canceled"); + return nullptr; + } + } m_resume_task; + + suspend_point_type(arena* a, std::size_t stack_size, task_dispatcher& target); +#endif /*__TBB_RESUMABLE_TASKS */ +}; + +#if _MSC_VER && !defined(__INTEL_COMPILER) +// structure was padded due to alignment specifier +#pragma warning( push ) +#pragma warning( disable: 4324 ) +#endif + +class alignas (max_nfs_size) task_dispatcher { +public: + // TODO: reconsider low level design to better organize dependencies and files. + friend class thread_data; + friend class arena_slot; + friend class nested_arena_context; + friend class delegated_task; + friend struct base_waiter; + + //! The list of possible post resume actions. + enum class post_resume_action { + invalid, + register_waiter, + cleanup, + notify, + none + }; + + //! The data of the current thread attached to this task_dispatcher + thread_data* m_thread_data{ nullptr }; + + //! The current execution data + execution_data_ext m_execute_data_ext; + + //! Properties + struct properties { + bool outermost{ true }; + bool fifo_tasks_allowed{ true }; + bool critical_task_allowed{ true }; + } m_properties; + + //! Position in the call stack when stealing is still allowed. + std::uintptr_t m_stealing_threshold{}; + + //! Suspend point (null if this task dispatcher has been never suspended) + suspend_point_type* m_suspend_point{ nullptr }; + + //! Attempt to get a task from the mailbox. + /** Gets a task only if it has not been executed by its sender or a thief + that has stolen it from the sender's task pool. Otherwise returns nullptr. + This method is intended to be used only by the thread extracting the proxy + from its mailbox. (In contrast to local task pool, mailbox can be read only + by its owner). 
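[Editor's note] The active/suspended/notified triple above resolves the race between a thread finishing a suspend and another thread trying to resume it: the resumer exchanges the state to notified and proceeds only if the old value was suspended, while the suspender exchanges to suspended and, if it sees notified, knows a resume already arrived and must replay it. A toy model of just that handshake (not the TBB types):

    #include <atomic>

    enum class stack_state { active, suspended, notified };

    struct toy_suspend_point {
        std::atomic<stack_state> state{stack_state::active};

        // Called by a thread that wants to resume this stack. Returns true only if
        // the stack had already finished suspending; otherwise the suspender will
        // observe `notified` and perform the resume itself.
        bool try_notify_resume() {
            return state.exchange(stack_state::notified) == stack_state::suspended;
        }

        // Called by the suspending thread once it has switched away from the stack.
        // Returns true if a resume request raced in and must be carried out now.
        bool finish_suspend() {
            return state.exchange(stack_state::suspended) == stack_state::notified;
        }
    };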
**/ + d1::task* get_mailbox_task(mail_inbox& my_inbox, execution_data_ext& ed, isolation_type isolation); + + d1::task* get_critical_task(d1::task*, execution_data_ext&, isolation_type, bool); + + template + d1::task* receive_or_steal_task(thread_data& tls, execution_data_ext& ed, Waiter& waiter, + isolation_type isolation, bool outermost, bool criticality_absence); + + template + d1::task* local_wait_for_all(d1::task * t, Waiter& waiter); + + task_dispatcher(const task_dispatcher&) = delete; + + bool can_steal(); +public: + task_dispatcher(arena* a); + + ~task_dispatcher() { + if (m_suspend_point) { + m_suspend_point->~suspend_point_type(); + cache_aligned_deallocate(m_suspend_point); + } + poison_pointer(m_thread_data); + poison_pointer(m_suspend_point); + } + + template + d1::task* local_wait_for_all(d1::task* t, Waiter& waiter); + + bool allow_fifo_task(bool new_state) { + bool old_state = m_properties.fifo_tasks_allowed; + m_properties.fifo_tasks_allowed = new_state; + return old_state; + } + + isolation_type set_isolation(isolation_type isolation) { + isolation_type prev = m_execute_data_ext.isolation; + m_execute_data_ext.isolation = isolation; + return prev; + } + + thread_data& get_thread_data() { + __TBB_ASSERT(m_thread_data, nullptr); + return *m_thread_data; + } + + static void execute_and_wait(d1::task* t, d1::wait_context& wait_ctx, d1::task_group_context& w_ctx); + + void set_stealing_threshold(std::uintptr_t stealing_threshold) { + bool assert_condition = (stealing_threshold == 0 && m_stealing_threshold != 0) || + (stealing_threshold != 0 && m_stealing_threshold == 0); + __TBB_ASSERT_EX( assert_condition, nullptr ); + m_stealing_threshold = stealing_threshold; + } + + d1::task* get_inbox_or_critical_task(execution_data_ext&, mail_inbox&, isolation_type, bool); + d1::task* get_stream_or_critical_task(execution_data_ext&, arena&, task_stream&, + unsigned& /*hint_for_stream*/, isolation_type, + bool /*critical_allowed*/); + d1::task* steal_or_get_critical(execution_data_ext&, arena&, unsigned /*arena_index*/, FastRandom&, + isolation_type, bool /*critical_allowed*/); + +#if __TBB_RESUMABLE_TASKS + /* [[noreturn]] */ void co_local_wait_for_all() noexcept; + void suspend(suspend_callback_type suspend_callback, void* user_callback); + void internal_suspend(); + void do_post_resume_action(); + + bool resume(task_dispatcher& target); + suspend_point_type* get_suspend_point(); + void init_suspend_point(arena* a, std::size_t stack_size); + friend void internal_resume(suspend_point_type*); + void recall_point(); +#endif /* __TBB_RESUMABLE_TASKS */ +}; + +#if _MSC_VER && !defined(__INTEL_COMPILER) +#pragma warning( pop ) +#endif + +inline std::uintptr_t calculate_stealing_threshold(std::uintptr_t base, std::size_t stack_size) { + __TBB_ASSERT(stack_size != 0, "Stack size cannot be zero"); + __TBB_ASSERT(base > stack_size / 2, "Stack anchor calculation overflow"); + return base - stack_size / 2; +} + +struct task_group_context_impl { + static void destroy(d1::task_group_context&); + static void initialize(d1::task_group_context&); + static void register_with(d1::task_group_context&, thread_data*); + static void bind_to_impl(d1::task_group_context&, thread_data*); + static void bind_to(d1::task_group_context&, thread_data*); + static void propagate_task_group_state(d1::task_group_context&, std::atomic d1::task_group_context::*, d1::task_group_context&, uint32_t); + static bool cancel_group_execution(d1::task_group_context&); + static bool is_group_execution_cancelled(const 
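[Editor's note] calculate_stealing_threshold above anchors the limit at the middle of the worker's stack: stealing, which can nest arbitrarily deep task execution, is refused once the stack pointer drops below base - stack_size/2, leaving headroom for the task bodies themselves. A stand-alone illustration of the same test, using the address of a local variable as an approximation of the current stack position and assuming the usual downward-growing stack:

    #include <cstdint>
    #include <cstddef>

    inline std::uintptr_t stack_position() {
        int probe;                                   // address of a local ~ current stack depth
        return reinterpret_cast<std::uintptr_t>(&probe);
    }

    // `base` is captured near the top of the thread's stack (e.g. at thread start).
    inline bool may_steal(std::uintptr_t base, std::size_t stack_size) {
        const std::uintptr_t threshold = base - stack_size / 2;  // same formula as above
        return stack_position() > threshold;         // stacks grow downward on mainstream ABIs
    }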
d1::task_group_context&); + static void reset(d1::task_group_context&); + static void capture_fp_settings(d1::task_group_context&); + static void copy_fp_settings(d1::task_group_context& ctx, const d1::task_group_context& src); +}; + + +//! Forward declaration for scheduler entities +bool gcc_rethrow_exception_broken(); +void fix_broken_rethrow(); +//! Forward declaration: throws std::runtime_error with what() returning error_code description prefixed with aux_info +void handle_perror(int error_code, const char* aux_info); + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_scheduler_common_H */ diff --git a/third_party/tbb/semaphore.cpp b/third_party/tbb/semaphore.cpp new file mode 100644 index 000000000..a1ac96b3d --- /dev/null +++ b/third_party/tbb/semaphore.cpp @@ -0,0 +1,93 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/semaphore.h" +#if __TBB_USE_SRWLOCK +#include "third_party/tbb/dynamic_link.h" // Refers to src/tbb, not include/tbb +// MISSING #include "tbb_misc.h" +#endif + +namespace tbb { +namespace detail { +namespace r1 { + +// TODO: For new win UI port, we can use SRWLock API without dynamic_link etc. +#if __TBB_USE_SRWLOCK + +static std::atomic concmon_module_inited; + +void WINAPI init_binsem_using_event( SRWLOCK* h_ ) +{ + srwl_or_handle* shptr = (srwl_or_handle*) h_; + shptr->h = CreateEventEx( nullptr, nullptr, 0, EVENT_ALL_ACCESS|SEMAPHORE_ALL_ACCESS ); +} + +void WINAPI acquire_binsem_using_event( SRWLOCK* h_ ) +{ + srwl_or_handle* shptr = (srwl_or_handle*) h_; + WaitForSingleObjectEx( shptr->h, INFINITE, FALSE ); +} + +void WINAPI release_binsem_using_event( SRWLOCK* h_ ) +{ + srwl_or_handle* shptr = (srwl_or_handle*) h_; + SetEvent( shptr->h ); +} + +static void (WINAPI *__TBB_init_binsem)( SRWLOCK* ) = (void (WINAPI *)(SRWLOCK*))&init_binsem_using_event; +static void (WINAPI *__TBB_acquire_binsem)( SRWLOCK* ) = (void (WINAPI *)(SRWLOCK*))&acquire_binsem_using_event; +static void (WINAPI *__TBB_release_binsem)( SRWLOCK* ) = (void (WINAPI *)(SRWLOCK*))&release_binsem_using_event; + +//! Table describing the how to link the handlers. 
+static const dynamic_link_descriptor SRWLLinkTable[] = { + DLD(InitializeSRWLock, __TBB_init_binsem), + DLD(AcquireSRWLockExclusive, __TBB_acquire_binsem), + DLD(ReleaseSRWLockExclusive, __TBB_release_binsem) +}; + +inline void init_concmon_module() +{ + __TBB_ASSERT( (uintptr_t)__TBB_init_binsem==(uintptr_t)&init_binsem_using_event, nullptr); + if( dynamic_link( "Kernel32.dll", SRWLLinkTable, sizeof(SRWLLinkTable)/sizeof(dynamic_link_descriptor) ) ) { + __TBB_ASSERT( (uintptr_t)__TBB_init_binsem!=(uintptr_t)&init_binsem_using_event, nullptr); + __TBB_ASSERT( (uintptr_t)__TBB_acquire_binsem!=(uintptr_t)&acquire_binsem_using_event, nullptr); + __TBB_ASSERT( (uintptr_t)__TBB_release_binsem!=(uintptr_t)&release_binsem_using_event, nullptr); + } +} + +binary_semaphore::binary_semaphore() { + atomic_do_once( &init_concmon_module, concmon_module_inited ); + + __TBB_init_binsem( &my_sem.lock ); + if( (uintptr_t)__TBB_init_binsem!=(uintptr_t)&init_binsem_using_event ) + P(); +} + +binary_semaphore::~binary_semaphore() { + if( (uintptr_t)__TBB_init_binsem==(uintptr_t)&init_binsem_using_event ) + CloseHandle( my_sem.h ); +} + +void binary_semaphore::P() { __TBB_acquire_binsem( &my_sem.lock ); } + +void binary_semaphore::V() { __TBB_release_binsem( &my_sem.lock ); } + +#endif /* __TBB_USE_SRWLOCK */ + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/semaphore.h b/third_party/tbb/semaphore.h new file mode 100644 index 000000000..281d18516 --- /dev/null +++ b/third_party/tbb/semaphore.h @@ -0,0 +1,331 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
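[Editor's note] The SRWLock path above is an instance of a general pattern: function pointers start out aimed at portable fallbacks and are retargeted at better OS primitives if those symbols can be resolved at run time. A POSIX-flavoured sketch of the same idea with dlopen/dlsym (the library name libfastwait.so and the symbol fast_wait are made up for illustration; link with -ldl where required):

    #include <dlfcn.h>

    static void do_wait_fallback() { /* portable implementation */ }
    static void (*do_wait)() = &do_wait_fallback;    // default: the fallback

    static void init_fast_path() {
        // Try to resolve an optional, faster implementation from a shared library.
        if (void* lib = dlopen("libfastwait.so", RTLD_LAZY)) {       // hypothetical library
            if (void* sym = dlsym(lib, "fast_wait")) {               // hypothetical symbol
                do_wait = reinterpret_cast<void (*)()>(sym);         // retarget the pointer
            }
        }
    }

    int main() {
        init_fast_path();   // in TBB the equivalent step is guarded by atomic_do_once
        do_wait();          // callers never care which implementation won
    }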
+*/ + +#ifndef __TBB_semaphore_H +#define __TBB_semaphore_H + +#include "third_party/tbb/detail/_utils.h" + +#if _WIN32||_WIN64 +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" +#elif __APPLE__ +// MISSING #include +#else +#include "libc/thread/semaphore.h" +#ifdef TBB_USE_DEBUG +#include "third_party/libcxx/cerrno" +#endif +#endif /*_WIN32||_WIN64*/ + +#include "third_party/libcxx/atomic" + +#if __unix__ +#if defined(__has_include) +#define __TBB_has_include __has_include +#else +#define __TBB_has_include(x) 0 +#endif + +/* Futex definitions */ +#include "libc/calls/calls.h" +#include "libc/calls/weirdtypes.h" +#include "libc/runtime/pathconf.h" +#include "libc/runtime/runtime.h" +#include "libc/runtime/sysconf.h" +#include "libc/sysv/consts/f.h" +#include "libc/sysv/consts/fileno.h" +#include "libc/sysv/consts/o.h" +#include "libc/sysv/consts/ok.h" +#include "libc/time/time.h" +#include "third_party/getopt/getopt.h" +#include "third_party/musl/crypt.h" +#include "third_party/musl/lockf.h" +#if defined(__linux__) || __TBB_has_include() +#include "libc/stdio/syscall.h" +#endif + +#if defined(SYS_futex) + +/* This section is included for Linux and some other systems that may support futexes.*/ + +#define __TBB_USE_FUTEX 1 + +/* +If available, use typical headers where futex API is defined. While Linux and OpenBSD +are known to provide such headers, other systems might have them as well. +*/ +#if defined(__linux__) || __TBB_has_include() +#include "libc/sysv/consts/futex.h" +#include "libc/sysv/consts/nr.h" +#elif defined(__OpenBSD__) || __TBB_has_include() +// MISSING #include +#endif + +#include "third_party/libcxx/climits" +#include "third_party/libcxx/cerrno" + +/* +Some systems might not define the macros or use different names. In such case we expect +the actual parameter values to match Linux: 0 for wait, 1 for wake. 
+*/ +#if defined(FUTEX_WAIT_PRIVATE) +#define __TBB_FUTEX_WAIT FUTEX_WAIT_PRIVATE +#elif defined(FUTEX_WAIT) +#define __TBB_FUTEX_WAIT FUTEX_WAIT +#else +#define __TBB_FUTEX_WAIT 0 +#endif + +#if defined(FUTEX_WAKE_PRIVATE) +#define __TBB_FUTEX_WAKE FUTEX_WAKE_PRIVATE +#elif defined(FUTEX_WAKE) +#define __TBB_FUTEX_WAKE FUTEX_WAKE +#else +#define __TBB_FUTEX_WAKE 1 +#endif + +#endif // SYS_futex +#endif // __unix__ + +namespace tbb { +namespace detail { +namespace r1 { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Futex implementation +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#if __TBB_USE_FUTEX + +static inline int futex_wait( void *futex, int comparand ) { + int r = ::syscall(SYS_futex, futex, __TBB_FUTEX_WAIT, comparand, nullptr, nullptr, 0); +#if TBB_USE_ASSERT + int e = errno; + __TBB_ASSERT(r == 0 || r == EWOULDBLOCK || (r == -1 && (e == EAGAIN || e == EINTR)), "futex_wait failed."); +#endif /* TBB_USE_ASSERT */ + return r; +} + +static inline int futex_wakeup_one( void *futex ) { + int r = ::syscall(SYS_futex, futex, __TBB_FUTEX_WAKE, 1, nullptr, nullptr, 0); + __TBB_ASSERT(r == 0 || r == 1, "futex_wakeup_one: more than one thread woken up?"); + return r; +} + +// Additional possible methods that are not required right now +// static inline int futex_wakeup_all( void *futex ) { +// int r = ::syscall( SYS_futex,futex,__TBB_FUTEX_WAKE,INT_MAX,nullptr,nullptr,0 ); +// __TBB_ASSERT( r>=0, "futex_wakeup_all: error in waking up threads" ); +// return r; +// } + +#endif // __TBB_USE_FUTEX + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#if _WIN32||_WIN64 +typedef LONG sem_count_t; +//! Edsger Dijkstra's counting semaphore +class semaphore : no_copy { + static const int max_semaphore_cnt = MAXLONG; +public: + //! ctor + semaphore(size_t start_cnt_ = 0) {init_semaphore(start_cnt_);} + //! dtor + ~semaphore() {CloseHandle( sem );} + //! wait/acquire + void P() {WaitForSingleObjectEx( sem, INFINITE, FALSE );} + //! post/release + void V() {ReleaseSemaphore( sem, 1, nullptr);} +private: + HANDLE sem; + void init_semaphore(size_t start_cnt_) { + sem = CreateSemaphoreEx( nullptr, LONG(start_cnt_), max_semaphore_cnt, nullptr, 0, SEMAPHORE_ALL_ACCESS ); + } +}; +#elif __APPLE__ +//! Edsger Dijkstra's counting semaphore +class semaphore : no_copy { +public: + //! ctor + semaphore(int start_cnt_ = 0) { my_sem = dispatch_semaphore_create(start_cnt_); } + //! dtor + ~semaphore() { dispatch_release(my_sem); } + //! wait/acquire + void P() { + std::intptr_t ret = dispatch_semaphore_wait(my_sem, DISPATCH_TIME_FOREVER); + __TBB_ASSERT_EX(ret == 0, "dispatch_semaphore_wait() failed"); + } + //! post/release + void V() { dispatch_semaphore_signal(my_sem); } +private: + dispatch_semaphore_t my_sem; +}; +#else /* Linux/Unix */ +typedef uint32_t sem_count_t; +//! Edsger Dijkstra's counting semaphore +class semaphore : no_copy { +public: + //! ctor + semaphore(int start_cnt_ = 0 ) { init_semaphore( start_cnt_ ); } + + //! dtor + ~semaphore() { + int ret = sem_destroy( &sem ); + __TBB_ASSERT_EX( !ret, nullptr); + } + //! wait/acquire + void P() { + while( sem_wait( &sem )!=0 ) + __TBB_ASSERT( errno==EINTR, nullptr); + } + //! 
post/release + void V() { sem_post( &sem ); } +private: + sem_t sem; + void init_semaphore(int start_cnt_) { + int ret = sem_init( &sem, /*shared among threads*/ 0, start_cnt_ ); + __TBB_ASSERT_EX( !ret, nullptr); + } +}; +#endif /* _WIN32||_WIN64 */ + + +//! for performance reasons, we want specialized binary_semaphore +#if _WIN32||_WIN64 +#if !__TBB_USE_SRWLOCK +//! binary_semaphore for concurrent_monitor +class binary_semaphore : no_copy { +public: + //! ctor + binary_semaphore() { my_sem = CreateEventEx( nullptr, nullptr, 0, EVENT_ALL_ACCESS ); } + //! dtor + ~binary_semaphore() { CloseHandle( my_sem ); } + //! wait/acquire + void P() { WaitForSingleObjectEx( my_sem, INFINITE, FALSE ); } + //! post/release + void V() { SetEvent( my_sem ); } +private: + HANDLE my_sem; +}; +#else /* __TBB_USE_SRWLOCK */ + +union srwl_or_handle { + SRWLOCK lock; + HANDLE h; +}; + +//! binary_semaphore for concurrent_monitor +class binary_semaphore : no_copy { +public: + //! ctor + binary_semaphore(); + //! dtor + ~binary_semaphore(); + //! wait/acquire + void P(); + //! post/release + void V(); +private: + srwl_or_handle my_sem; +}; +#endif /* !__TBB_USE_SRWLOCK */ +#elif __APPLE__ +//! binary_semaphore for concurrent monitor +using binary_semaphore = semaphore; +#else /* Linux/Unix */ + +#if __TBB_USE_FUTEX +class binary_semaphore : no_copy { +// The implementation is equivalent to the "Mutex, Take 3" one +// in the paper "Futexes Are Tricky" by Ulrich Drepper +public: + //! ctor + binary_semaphore() { my_sem = 1; } + //! dtor + ~binary_semaphore() {} + //! wait/acquire + void P() { + int s = 0; + if( !my_sem.compare_exchange_strong( s, 1 ) ) { + if( s!=2 ) + s = my_sem.exchange( 2 ); + while( s!=0 ) { // This loop deals with spurious wakeup + futex_wait( &my_sem, 2 ); + s = my_sem.exchange( 2 ); + } + } + } + //! post/release + void V() { + __TBB_ASSERT( my_sem.load(std::memory_order_relaxed)>=1, "multiple V()'s in a row?" ); + if( my_sem.exchange( 0 )==2 ) + futex_wakeup_one( &my_sem ); + } +private: + std::atomic my_sem; // 0 - open; 1 - closed, no waits; 2 - closed, possible waits +}; +#else +typedef uint32_t sem_count_t; +//! binary_semaphore for concurrent monitor +class binary_semaphore : no_copy { +public: + //! ctor + binary_semaphore() { + int ret = sem_init( &my_sem, /*shared among threads*/ 0, 0 ); + __TBB_ASSERT_EX( !ret, nullptr); + } + //! dtor + ~binary_semaphore() { + int ret = sem_destroy( &my_sem ); + __TBB_ASSERT_EX( !ret, nullptr); + } + //! wait/acquire + void P() { + while( sem_wait( &my_sem )!=0 ) + __TBB_ASSERT( errno==EINTR, nullptr); + } + //! post/release + void V() { sem_post( &my_sem ); } +private: + sem_t my_sem; +}; +#endif /* __TBB_USE_FUTEX */ +#endif /* _WIN32||_WIN64 */ + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* __TBB_semaphore_H */ diff --git a/third_party/tbb/small_object_pool.cpp b/third_party/tbb/small_object_pool.cpp new file mode 100644 index 000000000..74a970d9d --- /dev/null +++ b/third_party/tbb/small_object_pool.cpp @@ -0,0 +1,155 @@ +// clang-format off +/* + Copyright (c) 2020-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
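[Editor's note] The futex-based binary_semaphore above implements the "take 3" mutex from Drepper's "Futexes Are Tricky": 0 = open, 1 = closed with no waiters, 2 = closed with possible waiters, and the wake syscall is issued only when the released value was 2. The same protocol can be written portably with C++20 std::atomic wait/notify standing in for the raw futex calls; this is a sketch of the algorithm, not the TBB code:

    #include <atomic>

    // Drepper-style binary semaphore; atomic wait/notify_one replace futex_wait/wake.
    class binary_sem {
        std::atomic<int> state{0};   // 0 open, 1 closed (no waiters), 2 closed (maybe waiters)
    public:
        void acquire() {             // P()
            int s = 0;
            if (!state.compare_exchange_strong(s, 1)) {
                if (s != 2) s = state.exchange(2);
                while (s != 0) {             // loop handles spurious wakeups
                    state.wait(2);           // sleep while the value is still 2
                    s = state.exchange(2);
                }
            }
        }
        void release() {             // V()
            if (state.exchange(0) == 2)      // someone may be sleeping
                state.notify_one();
        }
    };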
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/cache_aligned_allocator.h" +#include "third_party/tbb/detail/_small_object_pool.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/thread_data.h" +#include "third_party/tbb/task_dispatcher.h" + +#include "third_party/libcxx/cstddef" + +namespace tbb { +namespace detail { +namespace r1 { + +small_object_pool_impl::small_object* const small_object_pool_impl::dead_public_list = + reinterpret_cast(1); + +void* __TBB_EXPORTED_FUNC allocate(d1::small_object_pool*& allocator, std::size_t number_of_bytes, const d1::execution_data& ed) { + auto& tls = static_cast(ed).task_disp->get_thread_data(); + auto pool = tls.my_small_object_pool; + return pool->allocate_impl(allocator, number_of_bytes); +} + +void* __TBB_EXPORTED_FUNC allocate(d1::small_object_pool*& allocator, std::size_t number_of_bytes) { + // TODO: optimize if the allocator contains a valid pool. + auto tls = governor::get_thread_data(); + auto pool = tls->my_small_object_pool; + return pool->allocate_impl(allocator, number_of_bytes); +} + +void* small_object_pool_impl::allocate_impl(d1::small_object_pool*& allocator, std::size_t number_of_bytes) +{ + small_object* obj{nullptr}; + + if (number_of_bytes <= small_object_size) { + if (m_private_list) { + obj = m_private_list; + m_private_list = m_private_list->next; + } else if (m_public_list.load(std::memory_order_relaxed)) { + // No fence required for read of my_public_list above, because std::atomic::exchange() has a fence. + obj = m_public_list.exchange(nullptr); + __TBB_ASSERT( obj, "another thread emptied the my_public_list" ); + m_private_list = obj->next; + } else { + obj = new (cache_aligned_allocate(small_object_size)) small_object{nullptr}; + ++m_private_counter; + } + } else { + obj = new (cache_aligned_allocate(number_of_bytes)) small_object{nullptr}; + } + allocator = this; + + // Return uninitialized memory for further construction on user side. 
+ obj->~small_object(); + return obj; +} + +void __TBB_EXPORTED_FUNC deallocate(d1::small_object_pool& allocator, void* ptr, std::size_t number_of_bytes) { + auto pool = static_cast(&allocator); + auto tls = governor::get_thread_data(); + pool->deallocate_impl(ptr, number_of_bytes, *tls); +} + +void __TBB_EXPORTED_FUNC deallocate(d1::small_object_pool& allocator, void* ptr, std::size_t number_of_bytes, const d1::execution_data& ed) { + auto& tls = static_cast(ed).task_disp->get_thread_data(); + auto pool = static_cast(&allocator); + pool->deallocate_impl(ptr, number_of_bytes, tls); +} + +void small_object_pool_impl::deallocate_impl(void* ptr, std::size_t number_of_bytes, thread_data& td) { + __TBB_ASSERT(ptr != nullptr, "pointer to deallocate should not be null"); + __TBB_ASSERT(number_of_bytes >= sizeof(small_object), "number of bytes should be at least sizeof(small_object)"); + + if (number_of_bytes <= small_object_size) { + auto obj = new (ptr) small_object{nullptr}; + if (td.my_small_object_pool == this) { + obj->next = m_private_list; + m_private_list = obj; + } else { + auto old_public_list = m_public_list.load(std::memory_order_relaxed); + + for (;;) { + if (old_public_list == dead_public_list) { + obj->~small_object(); + cache_aligned_deallocate(obj); + if (++m_public_counter == 0) + { + this->~small_object_pool_impl(); + cache_aligned_deallocate(this); + } + break; + } + obj->next = old_public_list; + if (m_public_list.compare_exchange_strong(old_public_list, obj)) { + break; + } + } + } + } else { + cache_aligned_deallocate(ptr); + } +} + +std::int64_t small_object_pool_impl::cleanup_list(small_object* list) +{ + std::int64_t removed_count{}; + + while (list) { + small_object* current = list; + list = list->next; + current->~small_object(); + cache_aligned_deallocate(current); + ++removed_count; + } + return removed_count; +} + +void small_object_pool_impl::destroy() +{ + // clean up private list and subtract the removed count from private counter + m_private_counter -= cleanup_list(m_private_list); + // Grab public list and place dead mark + small_object* public_list = m_public_list.exchange(dead_public_list); + // clean up public list and subtract from private (intentionally) counter + m_private_counter -= cleanup_list(public_list); + __TBB_ASSERT(m_private_counter >= 0, "Private counter may not be less than 0"); + // Equivalent to fetch_sub(m_private_counter) - m_private_counter. But we need to do it + // atomically with operator-= not to access m_private_counter after the subtraction. + auto new_value = m_public_counter -= m_private_counter; + // check if this method is responsible to clean up the resources + if (new_value == 0) { + this->~small_object_pool_impl(); + cache_aligned_deallocate(this); + } +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/small_object_pool_impl.h b/third_party/tbb/small_object_pool_impl.h new file mode 100644 index 000000000..7478880a9 --- /dev/null +++ b/third_party/tbb/small_object_pool_impl.h @@ -0,0 +1,60 @@ +// clang-format off +/* + Copyright (c) 2020-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_small_object_pool_impl_H +#define __TBB_small_object_pool_impl_H + +#include "third_party/tbb/detail/_small_object_pool.h" +#include "third_party/tbb/detail/_utils.h" + +#include "third_party/libcxx/cstddef" +#include "third_party/libcxx/cstdint" +#include "third_party/libcxx/atomic" + + +namespace tbb { +namespace detail { +namespace r1 { + +class thread_data; + +class small_object_pool_impl : public d1::small_object_pool +{ + static constexpr std::size_t small_object_size = 256; + struct small_object { + small_object* next; + }; + static small_object* const dead_public_list; +public: + void* allocate_impl(small_object_pool*& allocator, std::size_t number_of_bytes); + void deallocate_impl(void* ptr, std::size_t number_of_bytes, thread_data& td); + void destroy(); +private: + static std::int64_t cleanup_list(small_object* list); + ~small_object_pool_impl() = default; +private: + alignas(max_nfs_size) small_object* m_private_list; + std::int64_t m_private_counter{}; + alignas(max_nfs_size) std::atomic m_public_list; + std::atomic m_public_counter{}; +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* __TBB_small_object_pool_impl_H */ diff --git a/third_party/tbb/spin_mutex.h b/third_party/tbb/spin_mutex.h new file mode 100644 index 000000000..69d1047bb --- /dev/null +++ b/third_party/tbb/spin_mutex.h @@ -0,0 +1,135 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_spin_mutex_H +#define __TBB_spin_mutex_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_mutex_common.h" + +#include "third_party/tbb/profiling.h" + +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_scoped_lock.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace d1 { + +#if __TBB_TSX_INTRINSICS_PRESENT +class rtm_mutex; +#endif + +/** A spin_mutex is a low-level synchronization primitive. + While locked, it causes the waiting threads to spin in a loop until the lock is released. + It should be used only for locking short critical sections + (typically less than 20 instructions) when fairness is not an issue. + If zero-initialized, the mutex is considered unheld. + @ingroup synchronization */ +class spin_mutex { +public: + //! Constructors + spin_mutex() noexcept : m_flag(false) { + create_itt_sync(this, "tbb::spin_mutex", ""); + }; + + //! Destructor + ~spin_mutex() = default; + + //! 
No Copy + spin_mutex(const spin_mutex&) = delete; + spin_mutex& operator=(const spin_mutex&) = delete; + + using scoped_lock = unique_scoped_lock; + + //! Mutex traits + static constexpr bool is_rw_mutex = false; + static constexpr bool is_recursive_mutex = false; + static constexpr bool is_fair_mutex = false; + + //! Acquire lock + /** Spin if the lock is taken */ + void lock() { + atomic_backoff backoff; + call_itt_notify(prepare, this); + while (m_flag.exchange(true)) backoff.pause(); + call_itt_notify(acquired, this); + } + + //! Try acquiring lock (non-blocking) + /** Return true if lock acquired; false otherwise. */ + bool try_lock() { + bool result = !m_flag.exchange(true); + if (result) { + call_itt_notify(acquired, this); + } + return result; + } + + //! Release lock + void unlock() { + call_itt_notify(releasing, this); + m_flag.store(false, std::memory_order_release); + } + +protected: + std::atomic m_flag; +}; // class spin_mutex + +#if TBB_USE_PROFILING_TOOLS +inline void set_name(spin_mutex& obj, const char* name) { + itt_set_sync_name(&obj, name); +} +#if (_WIN32||_WIN64) +inline void set_name(spin_mutex& obj, const wchar_t* name) { + itt_set_sync_name(&obj, name); +} +#endif //WIN +#else +inline void set_name(spin_mutex&, const char*) {} +#if (_WIN32||_WIN64) +inline void set_name(spin_mutex&, const wchar_t*) {} +#endif // WIN +#endif +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::spin_mutex; +} // namespace v1 +namespace profiling { + using detail::d1::set_name; +} +} // namespace tbb + +#include "third_party/tbb/detail/_rtm_mutex.h" + +namespace tbb { +inline namespace v1 { +#if __TBB_TSX_INTRINSICS_PRESENT + using speculative_spin_mutex = detail::d1::rtm_mutex; +#else + using speculative_spin_mutex = detail::d1::spin_mutex; +#endif +} +} + +#endif /* __TBB_spin_mutex_H */ + diff --git a/third_party/tbb/spin_rw_mutex.h b/third_party/tbb/spin_rw_mutex.h new file mode 100644 index 000000000..71cbdf7ec --- /dev/null +++ b/third_party/tbb/spin_rw_mutex.h @@ -0,0 +1,230 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_spin_rw_mutex_H +#define __TBB_spin_rw_mutex_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_mutex_common.h" + +#include "third_party/tbb/profiling.h" + +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_scoped_lock.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace d1 { + +#if __TBB_TSX_INTRINSICS_PRESENT +class rtm_rw_mutex; +#endif + +//! Fast, unfair, spinning reader-writer lock with backoff and writer-preference +/** @ingroup synchronization */ +class spin_rw_mutex { +public: + //! Constructors + spin_rw_mutex() noexcept : m_state(0) { + create_itt_sync(this, "tbb::spin_rw_mutex", ""); + } + + //! 
Destructor + ~spin_rw_mutex() { + __TBB_ASSERT(!m_state, "destruction of an acquired mutex"); + } + + //! No Copy + spin_rw_mutex(const spin_rw_mutex&) = delete; + spin_rw_mutex& operator=(const spin_rw_mutex&) = delete; + + using scoped_lock = rw_scoped_lock; + + //! Mutex traits + static constexpr bool is_rw_mutex = true; + static constexpr bool is_recursive_mutex = false; + static constexpr bool is_fair_mutex = false; + + //! Acquire lock + void lock() { + call_itt_notify(prepare, this); + for (atomic_backoff backoff; ; backoff.pause()) { + state_type s = m_state.load(std::memory_order_relaxed); + if (!(s & BUSY)) { // no readers, no writers + if (m_state.compare_exchange_strong(s, WRITER)) + break; // successfully stored writer flag + backoff.reset(); // we could be very close to complete op. + } else if (!(s & WRITER_PENDING)) { // no pending writers + m_state |= WRITER_PENDING; + } + } + call_itt_notify(acquired, this); + } + + //! Try acquiring lock (non-blocking) + /** Return true if lock acquired; false otherwise. */ + bool try_lock() { + // for a writer: only possible to acquire if no active readers or writers + state_type s = m_state.load(std::memory_order_relaxed); + if (!(s & BUSY)) { // no readers, no writers; mask is 1..1101 + if (m_state.compare_exchange_strong(s, WRITER)) { + call_itt_notify(acquired, this); + return true; // successfully stored writer flag + } + } + return false; + } + + //! Release lock + void unlock() { + call_itt_notify(releasing, this); + m_state &= READERS; + } + + //! Lock shared ownership mutex + void lock_shared() { + call_itt_notify(prepare, this); + for (atomic_backoff b; ; b.pause()) { + state_type s = m_state.load(std::memory_order_relaxed); + if (!(s & (WRITER | WRITER_PENDING))) { // no writer or write requests + state_type prev_state = m_state.fetch_add(ONE_READER); + if (!(prev_state & WRITER)) { + break; // successfully stored increased number of readers + } + // writer got there first, undo the increment + m_state -= ONE_READER; + } + } + call_itt_notify(acquired, this); + __TBB_ASSERT(m_state & READERS, "invalid state of a read lock: no readers"); + } + + //! Try lock shared ownership mutex + bool try_lock_shared() { + // for a reader: acquire if no active or waiting writers + state_type s = m_state.load(std::memory_order_relaxed); + if (!(s & (WRITER | WRITER_PENDING))) { // no writers + state_type prev_state = m_state.fetch_add(ONE_READER); + if (!(prev_state & WRITER)) { // got the lock + call_itt_notify(acquired, this); + return true; // successfully stored increased number of readers + } + // writer got there first, undo the increment + m_state -= ONE_READER; + } + return false; + } + + //! Unlock shared ownership mutex + void unlock_shared() { + __TBB_ASSERT(m_state & READERS, "invalid state of a read lock: no readers"); + call_itt_notify(releasing, this); + m_state -= ONE_READER; + } + +protected: + /** Internal non ISO C++ standard API **/ + //! This API is used through the scoped_lock class + + //! Upgrade reader to become a writer. + /** Returns whether the upgrade happened without releasing and re-acquiring the lock */ + bool upgrade() { + state_type s = m_state.load(std::memory_order_relaxed); + __TBB_ASSERT(s & READERS, "invalid state before upgrade: no readers "); + // Check and set writer-pending flag. 
+ // Required conditions: either no pending writers, or we are the only reader + // (with multiple readers and pending writer, another upgrade could have been requested) + while ((s & READERS) == ONE_READER || !(s & WRITER_PENDING)) { + if (m_state.compare_exchange_strong(s, s | WRITER | WRITER_PENDING)) { + atomic_backoff backoff; + while ((m_state.load(std::memory_order_relaxed) & READERS) != ONE_READER) backoff.pause(); + __TBB_ASSERT((m_state & (WRITER_PENDING|WRITER)) == (WRITER_PENDING | WRITER), "invalid state when upgrading to writer"); + // Both new readers and writers are blocked at this time + m_state -= (ONE_READER + WRITER_PENDING); + return true; // successfully upgraded + } + } + // Slow reacquire + unlock_shared(); + lock(); + return false; + } + + //! Downgrade writer to a reader + void downgrade() { + call_itt_notify(releasing, this); + m_state += (ONE_READER - WRITER); + __TBB_ASSERT(m_state & READERS, "invalid state after downgrade: no readers"); + } + + using state_type = std::intptr_t; + static constexpr state_type WRITER = 1; + static constexpr state_type WRITER_PENDING = 2; + static constexpr state_type READERS = ~(WRITER | WRITER_PENDING); + static constexpr state_type ONE_READER = 4; + static constexpr state_type BUSY = WRITER | READERS; + friend scoped_lock; + //! State of lock + /** Bit 0 = writer is holding lock + Bit 1 = request by a writer to acquire lock (hint to readers to wait) + Bit 2..N = number of readers holding lock */ + std::atomic m_state; +}; // class spin_rw_mutex + +#if TBB_USE_PROFILING_TOOLS +inline void set_name(spin_rw_mutex& obj, const char* name) { + itt_set_sync_name(&obj, name); +} +#if (_WIN32||_WIN64) +inline void set_name(spin_rw_mutex& obj, const wchar_t* name) { + itt_set_sync_name(&obj, name); +} +#endif // WIN +#else +inline void set_name(spin_rw_mutex&, const char*) {} +#if (_WIN32||_WIN64) +inline void set_name(spin_rw_mutex&, const wchar_t*) {} +#endif // WIN +#endif +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::spin_rw_mutex; +} // namespace v1 +namespace profiling { + using detail::d1::set_name; +} +} // namespace tbb + +#include "third_party/tbb/detail/_rtm_rw_mutex.h" + +namespace tbb { +inline namespace v1 { +#if __TBB_TSX_INTRINSICS_PRESENT + using speculative_spin_rw_mutex = detail::d1::rtm_rw_mutex; +#else + using speculative_spin_rw_mutex = detail::d1::spin_rw_mutex; +#endif +} +} + +#endif /* __TBB_spin_rw_mutex_H */ + diff --git a/third_party/tbb/task.cpp b/third_party/tbb/task.cpp new file mode 100644 index 000000000..c40017376 --- /dev/null +++ b/third_party/tbb/task.cpp @@ -0,0 +1,228 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Do not include task.h directly. 
Use scheduler_common.h instead +#include "third_party/tbb/scheduler_common.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/arena.h" +#include "third_party/tbb/thread_data.h" +#include "third_party/tbb/task_dispatcher.h" +#include "third_party/tbb/waiters.h" +#include "third_party/tbb/itt_notify.h" + +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/partitioner.h" +#include "third_party/tbb/task.h" + +#include "third_party/libcxx/cstring" + +namespace tbb { +namespace detail { +namespace r1 { + +//------------------------------------------------------------------------ +// resumable tasks +//------------------------------------------------------------------------ +#if __TBB_RESUMABLE_TASKS + +void suspend(suspend_callback_type suspend_callback, void* user_callback) { + thread_data& td = *governor::get_thread_data(); + td.my_task_dispatcher->suspend(suspend_callback, user_callback); + // Do not access td after suspend. +} + +void resume(suspend_point_type* sp) { + assert_pointers_valid(sp, sp->m_arena); + task_dispatcher& task_disp = sp->m_resume_task.m_target; + + if (sp->try_notify_resume()) { + // TODO: remove this work-around + // Prolong the arena's lifetime while all coroutines are alive + // (otherwise the arena can be destroyed while some tasks are suspended). + arena& a = *sp->m_arena; + a.my_references += arena::ref_worker; + + if (task_disp.m_properties.critical_task_allowed) { + // The target is not in the process of executing critical task, so the resume task is not critical. + a.my_resume_task_stream.push(&sp->m_resume_task, random_lane_selector(sp->m_random)); + } else { + #if __TBB_PREVIEW_CRITICAL_TASKS + // The target is in the process of executing critical task, so the resume task is critical. + a.my_critical_task_stream.push(&sp->m_resume_task, random_lane_selector(sp->m_random)); + #endif + } + // Do not access target after that point. + a.advertise_new_work(); + // Release our reference to my_arena. + a.on_thread_leaving(arena::ref_worker); + } + +} + +suspend_point_type* current_suspend_point() { + thread_data& td = *governor::get_thread_data(); + return td.my_task_dispatcher->get_suspend_point(); +} + +task_dispatcher& create_coroutine(thread_data& td) { + // We may have some task dispatchers cached + task_dispatcher* task_disp = td.my_arena->my_co_cache.pop(); + if (!task_disp) { + void* ptr = cache_aligned_allocate(sizeof(task_dispatcher)); + task_disp = new(ptr) task_dispatcher(td.my_arena); + task_disp->init_suspend_point(td.my_arena, td.my_arena->my_threading_control->worker_stack_size()); + } + // Prolong the arena's lifetime until all coroutines is alive + // (otherwise the arena can be destroyed while some tasks are suspended). + // TODO: consider behavior if there are more than 4K external references. + td.my_arena->my_references += arena::ref_external; + return *task_disp; +} + +void task_dispatcher::internal_suspend() { + __TBB_ASSERT(m_thread_data != nullptr, nullptr); + + arena_slot* slot = m_thread_data->my_arena_slot; + __TBB_ASSERT(slot != nullptr, nullptr); + + task_dispatcher& default_task_disp = slot->default_task_dispatcher(); + // TODO: simplify the next line, e.g. is_task_dispatcher_recalled( task_dispatcher& ) + bool is_recalled = default_task_disp.get_suspend_point()->m_is_owner_recalled.load(std::memory_order_acquire); + task_dispatcher& target = is_recalled ? 
default_task_disp : create_coroutine(*m_thread_data); + + resume(target); + + if (m_properties.outermost) { + recall_point(); + } +} + +void task_dispatcher::suspend(suspend_callback_type suspend_callback, void* user_callback) { + __TBB_ASSERT(suspend_callback != nullptr, nullptr); + __TBB_ASSERT(user_callback != nullptr, nullptr); + suspend_callback(user_callback, get_suspend_point()); + + __TBB_ASSERT(m_thread_data != nullptr, nullptr); + __TBB_ASSERT(m_thread_data->my_post_resume_action == post_resume_action::none, nullptr); + __TBB_ASSERT(m_thread_data->my_post_resume_arg == nullptr, nullptr); + internal_suspend(); +} + +bool task_dispatcher::resume(task_dispatcher& target) { + // Do not create non-trivial objects on the stack of this function. They might never be destroyed + { + thread_data* td = m_thread_data; + __TBB_ASSERT(&target != this, "We cannot resume to ourself"); + __TBB_ASSERT(td != nullptr, "This task dispatcher must be attach to a thread data"); + __TBB_ASSERT(td->my_task_dispatcher == this, "Thread data must be attached to this task dispatcher"); + + // Change the task dispatcher + td->detach_task_dispatcher(); + td->attach_task_dispatcher(target); + } + __TBB_ASSERT(m_suspend_point != nullptr, "Suspend point must be created"); + __TBB_ASSERT(target.m_suspend_point != nullptr, "Suspend point must be created"); + // Swap to the target coroutine. + + m_suspend_point->resume(target.m_suspend_point); + // Pay attention that m_thread_data can be changed after resume + if (m_thread_data) { + thread_data* td = m_thread_data; + __TBB_ASSERT(td != nullptr, "This task dispatcher must be attach to a thread data"); + __TBB_ASSERT(td->my_task_dispatcher == this, "Thread data must be attached to this task dispatcher"); + do_post_resume_action(); + + // Remove the recall flag if the thread in its original task dispatcher + arena_slot* slot = td->my_arena_slot; + __TBB_ASSERT(slot != nullptr, nullptr); + if (this == slot->my_default_task_dispatcher) { + __TBB_ASSERT(m_suspend_point != nullptr, nullptr); + m_suspend_point->m_is_owner_recalled.store(false, std::memory_order_relaxed); + } + return true; + } + return false; +} + +void task_dispatcher::do_post_resume_action() { + thread_data* td = m_thread_data; + switch (td->my_post_resume_action) { + case post_resume_action::register_waiter: + { + __TBB_ASSERT(td->my_post_resume_arg, "The post resume action must have an argument"); + static_cast(td->my_post_resume_arg)->notify(); + break; + } + case post_resume_action::cleanup: + { + __TBB_ASSERT(td->my_post_resume_arg, "The post resume action must have an argument"); + task_dispatcher* to_cleanup = static_cast(td->my_post_resume_arg); + // Release coroutine's reference to my_arena + td->my_arena->on_thread_leaving(arena::ref_external); + // Cache the coroutine for possible later re-usage + td->my_arena->my_co_cache.push(to_cleanup); + break; + } + case post_resume_action::notify: + { + __TBB_ASSERT(td->my_post_resume_arg, "The post resume action must have an argument"); + suspend_point_type* sp = static_cast(td->my_post_resume_arg); + sp->recall_owner(); + // Do not access sp because it can be destroyed after recall + + auto is_our_suspend_point = [sp] (market_context ctx) { + return std::uintptr_t(sp) == ctx.my_uniq_addr; + }; + td->my_arena->get_waiting_threads_monitor().notify(is_our_suspend_point); + break; + } + default: + __TBB_ASSERT(td->my_post_resume_action == post_resume_action::none, "Unknown post resume action"); + __TBB_ASSERT(td->my_post_resume_arg == nullptr, "The post 
resume argument should not be set"); + } + td->clear_post_resume_action(); +} + +#else + +void suspend(suspend_callback_type, void*) { + __TBB_ASSERT_RELEASE(false, "Resumable tasks are unsupported on this platform"); +} + +void resume(suspend_point_type*) { + __TBB_ASSERT_RELEASE(false, "Resumable tasks are unsupported on this platform"); +} + +suspend_point_type* current_suspend_point() { + __TBB_ASSERT_RELEASE(false, "Resumable tasks are unsupported on this platform"); + return nullptr; +} + +#endif /* __TBB_RESUMABLE_TASKS */ + +void notify_waiters(std::uintptr_t wait_ctx_addr) { + auto is_related_wait_ctx = [&] (market_context context) { + return wait_ctx_addr == context.my_uniq_addr; + }; + + governor::get_thread_data()->my_arena->get_waiting_threads_monitor().notify(is_related_wait_ctx); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb + diff --git a/third_party/tbb/task.h b/third_party/tbb/task.h new file mode 100644 index 000000000..691c18341 --- /dev/null +++ b/third_party/tbb/task.h @@ -0,0 +1,38 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_task_H +#define __TBB_task_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_task.h" + +namespace tbb { +inline namespace v1 { +namespace task { +#if __TBB_RESUMABLE_TASKS + using detail::d1::suspend_point; + using detail::d1::resume; + using detail::d1::suspend; +#endif /* __TBB_RESUMABLE_TASKS */ + using detail::d1::current_context; +} // namespace task +} // namespace v1 +} // namespace tbb + +#endif /* __TBB_task_H */ diff --git a/third_party/tbb/task_arena.h b/third_party/tbb/task_arena.h new file mode 100644 index 000000000..2b3fbda53 --- /dev/null +++ b/third_party/tbb/task_arena.h @@ -0,0 +1,500 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_task_arena_H +#define __TBB_task_arena_H + +#include "third_party/tbb/detail/_config.h" + +#include "third_party/tbb/detail/_aligned_space.h" +#include "third_party/tbb/detail/_attach.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_small_object_pool.h" +#include "third_party/tbb/detail/_task.h" + +#include "third_party/tbb/detail/_task_handle.h" + +#if __TBB_ARENA_BINDING +#include "third_party/tbb/info.h" +#endif /*__TBB_ARENA_BINDING*/ + +namespace tbb { +namespace detail { + +namespace d1 { + +template +class task_arena_function : public delegate_base { + F &my_func; + aligned_space my_return_storage; + bool my_constructed{false}; + // The function should be called only once. + bool operator()() const override { + new (my_return_storage.begin()) R(my_func()); + return true; + } +public: + task_arena_function(F& f) : my_func(f) {} + // The function can be called only after operator() and only once. + R consume_result() { + my_constructed = true; + return std::move(*(my_return_storage.begin())); + } + ~task_arena_function() override { + if (my_constructed) { + my_return_storage.begin()->~R(); + } + } +}; + +template +class task_arena_function : public delegate_base { + F &my_func; + bool operator()() const override { + my_func(); + return true; + } +public: + task_arena_function(F& f) : my_func(f) {} + void consume_result() const {} + + friend class task_arena_base; +}; + +class task_arena_base; +class task_scheduler_observer; +} // namespace d1 + +namespace r1 { +class arena; +struct task_arena_impl; + +TBB_EXPORT void __TBB_EXPORTED_FUNC observe(d1::task_scheduler_observer&, bool); +TBB_EXPORT void __TBB_EXPORTED_FUNC initialize(d1::task_arena_base&); +TBB_EXPORT void __TBB_EXPORTED_FUNC terminate(d1::task_arena_base&); +TBB_EXPORT bool __TBB_EXPORTED_FUNC attach(d1::task_arena_base&); +TBB_EXPORT void __TBB_EXPORTED_FUNC execute(d1::task_arena_base&, d1::delegate_base&); +TBB_EXPORT void __TBB_EXPORTED_FUNC wait(d1::task_arena_base&); +TBB_EXPORT int __TBB_EXPORTED_FUNC max_concurrency(const d1::task_arena_base*); +TBB_EXPORT void __TBB_EXPORTED_FUNC isolate_within_arena(d1::delegate_base& d, std::intptr_t); + +TBB_EXPORT void __TBB_EXPORTED_FUNC enqueue(d1::task&, d1::task_arena_base*); +TBB_EXPORT void __TBB_EXPORTED_FUNC enqueue(d1::task&, d1::task_group_context&, d1::task_arena_base*); +TBB_EXPORT void __TBB_EXPORTED_FUNC submit(d1::task&, d1::task_group_context&, arena*, std::uintptr_t); +} // namespace r1 + +namespace d2 { +inline void enqueue_impl(task_handle&& th, d1::task_arena_base* ta) { + __TBB_ASSERT(th != nullptr, "Attempt to schedule empty task_handle"); + + auto& ctx = task_handle_accessor::ctx_of(th); + + // Do not access th after release + r1::enqueue(*task_handle_accessor::release(th), ctx, ta); +} +} //namespace d2 + +namespace d1 { + +static constexpr unsigned num_priority_levels = 3; +static constexpr int priority_stride = INT_MAX / (num_priority_levels + 1); + +class task_arena_base { + friend struct r1::task_arena_impl; + friend void r1::observe(d1::task_scheduler_observer&, bool); +public: + enum class priority : int { + low = 1 * priority_stride, + normal = 2 * priority_stride, + high = 3 * priority_stride + }; +#if __TBB_ARENA_BINDING + using constraints = tbb::detail::d1::constraints; +#endif /*__TBB_ARENA_BINDING*/ +protected: + //! Special settings + intptr_t my_version_and_traits; + + std::atomic my_initialization_state; + + //! 
nullptr if not currently initialized. + std::atomic my_arena; + static_assert(sizeof(std::atomic) == sizeof(r1::arena*), + "To preserve backward compatibility we need the equal size of an atomic pointer and a pointer"); + + //! Concurrency level for deferred initialization + int my_max_concurrency; + + //! Reserved slots for external threads + unsigned my_num_reserved_slots; + + //! Arena priority + priority my_priority; + + //! The NUMA node index to which the arena will be attached + numa_node_id my_numa_id; + + //! The core type index to which arena will be attached + core_type_id my_core_type; + + //! Number of threads per core + int my_max_threads_per_core; + + // Backward compatibility checks. + core_type_id core_type() const { + return (my_version_and_traits & core_type_support_flag) == core_type_support_flag ? my_core_type : automatic; + } + int max_threads_per_core() const { + return (my_version_and_traits & core_type_support_flag) == core_type_support_flag ? my_max_threads_per_core : automatic; + } + + enum { + default_flags = 0 + , core_type_support_flag = 1 + }; + + task_arena_base(int max_concurrency, unsigned reserved_for_masters, priority a_priority) + : my_version_and_traits(default_flags | core_type_support_flag) + , my_initialization_state(do_once_state::uninitialized) + , my_arena(nullptr) + , my_max_concurrency(max_concurrency) + , my_num_reserved_slots(reserved_for_masters) + , my_priority(a_priority) + , my_numa_id(automatic) + , my_core_type(automatic) + , my_max_threads_per_core(automatic) + {} + +#if __TBB_ARENA_BINDING + task_arena_base(const constraints& constraints_, unsigned reserved_for_masters, priority a_priority) + : my_version_and_traits(default_flags | core_type_support_flag) + , my_initialization_state(do_once_state::uninitialized) + , my_arena(nullptr) + , my_max_concurrency(constraints_.max_concurrency) + , my_num_reserved_slots(reserved_for_masters) + , my_priority(a_priority) + , my_numa_id(constraints_.numa_id) + , my_core_type(constraints_.core_type) + , my_max_threads_per_core(constraints_.max_threads_per_core) + {} +#endif /*__TBB_ARENA_BINDING*/ +public: + //! Typedef for number of threads that is automatic. + static const int automatic = -1; + static const int not_initialized = -2; +}; + +template +R isolate_impl(F& f) { + task_arena_function func(f); + r1::isolate_within_arena(func, /*isolation*/ 0); + return func.consume_result(); +} + +template +class enqueue_task : public task { + small_object_allocator m_allocator; + const F m_func; + + void finalize(const execution_data& ed) { + m_allocator.delete_object(this, ed); + } + task* execute(execution_data& ed) override { + m_func(); + finalize(ed); + return nullptr; + } + task* cancel(execution_data&) override { + __TBB_ASSERT_RELEASE(false, "Unhandled exception from enqueue task is caught"); + return nullptr; + } +public: + enqueue_task(const F& f, small_object_allocator& alloc) : m_allocator(alloc), m_func(f) {} + enqueue_task(F&& f, small_object_allocator& alloc) : m_allocator(alloc), m_func(std::move(f)) {} +}; + +template +void enqueue_impl(F&& f, task_arena_base* ta) { + small_object_allocator alloc{}; + r1::enqueue(*alloc.new_object::type>>(std::forward(f), alloc), ta); +} +/** 1-to-1 proxy representation class of scheduler's arena + * Constructors set up settings only, real construction is deferred till the first method invocation + * Destructor only removes one of the references to the inner arena representation. 
+ * Final destruction happens when all the references (and the work) are gone. + */ +class task_arena : public task_arena_base { + + void mark_initialized() { + __TBB_ASSERT( my_arena.load(std::memory_order_relaxed), "task_arena initialization is incomplete" ); + my_initialization_state.store(do_once_state::initialized, std::memory_order_release); + } + + template + R execute_impl(F& f) { + initialize(); + task_arena_function func(f); + r1::execute(*this, func); + return func.consume_result(); + } +public: + //! Creates task_arena with certain concurrency limits + /** Sets up settings only, real construction is deferred till the first method invocation + * @arg max_concurrency specifies total number of slots in arena where threads work + * @arg reserved_for_masters specifies number of slots to be used by external threads only. + * Value of 1 is default and reflects behavior of implicit arenas. + **/ + task_arena(int max_concurrency_ = automatic, unsigned reserved_for_masters = 1, + priority a_priority = priority::normal) + : task_arena_base(max_concurrency_, reserved_for_masters, a_priority) + {} + +#if __TBB_ARENA_BINDING + //! Creates task arena pinned to certain NUMA node + task_arena(const constraints& constraints_, unsigned reserved_for_masters = 1, + priority a_priority = priority::normal) + : task_arena_base(constraints_, reserved_for_masters, a_priority) + {} + + //! Copies settings from another task_arena + task_arena(const task_arena &s) // copy settings but not the reference or instance + : task_arena_base( + constraints{} + .set_numa_id(s.my_numa_id) + .set_max_concurrency(s.my_max_concurrency) + .set_core_type(s.my_core_type) + .set_max_threads_per_core(s.my_max_threads_per_core) + , s.my_num_reserved_slots, s.my_priority) + {} +#else + //! Copies settings from another task_arena + task_arena(const task_arena& a) // copy settings but not the reference or instance + : task_arena_base(a.my_max_concurrency, a.my_num_reserved_slots, a.my_priority) + {} +#endif /*__TBB_ARENA_BINDING*/ + + //! Tag class used to indicate the "attaching" constructor + struct attach {}; + + //! Creates an instance of task_arena attached to the current arena of the thread + explicit task_arena( attach ) + : task_arena_base(automatic, 1, priority::normal) // use default settings if attach fails + { + if (r1::attach(*this)) { + mark_initialized(); + } + } + + //! Creates an instance of task_arena attached to the current arena of the thread + explicit task_arena(d1::attach) + : task_arena(attach{}) + {} + + //! Forces allocation of the resources for the task_arena as specified in constructor arguments + void initialize() { + atomic_do_once([this]{ r1::initialize(*this); }, my_initialization_state); + } + + //! 
Overrides concurrency level and forces initialization of internal representation + void initialize(int max_concurrency_, unsigned reserved_for_masters = 1, + priority a_priority = priority::normal) + { + __TBB_ASSERT(!my_arena.load(std::memory_order_relaxed), "Impossible to modify settings of an already initialized task_arena"); + if( !is_active() ) { + my_max_concurrency = max_concurrency_; + my_num_reserved_slots = reserved_for_masters; + my_priority = a_priority; + r1::initialize(*this); + mark_initialized(); + } + } + +#if __TBB_ARENA_BINDING + void initialize(constraints constraints_, unsigned reserved_for_masters = 1, + priority a_priority = priority::normal) + { + __TBB_ASSERT(!my_arena.load(std::memory_order_relaxed), "Impossible to modify settings of an already initialized task_arena"); + if( !is_active() ) { + my_numa_id = constraints_.numa_id; + my_max_concurrency = constraints_.max_concurrency; + my_core_type = constraints_.core_type; + my_max_threads_per_core = constraints_.max_threads_per_core; + my_num_reserved_slots = reserved_for_masters; + my_priority = a_priority; + r1::initialize(*this); + mark_initialized(); + } + } +#endif /*__TBB_ARENA_BINDING*/ + + //! Attaches this instance to the current arena of the thread + void initialize(attach) { + // TODO: decide if this call must be thread-safe + __TBB_ASSERT(!my_arena.load(std::memory_order_relaxed), "Impossible to modify settings of an already initialized task_arena"); + if( !is_active() ) { + if ( !r1::attach(*this) ) { + r1::initialize(*this); + } + mark_initialized(); + } + } + + //! Attaches this instance to the current arena of the thread + void initialize(d1::attach) { + initialize(attach{}); + } + + //! Removes the reference to the internal arena representation. + //! Not thread safe wrt concurrent invocations of other methods. + void terminate() { + if( is_active() ) { + r1::terminate(*this); + my_initialization_state.store(do_once_state::uninitialized, std::memory_order_relaxed); + } + } + + //! Removes the reference to the internal arena representation, and destroys the external object. + //! Not thread safe wrt concurrent invocations of other methods. + ~task_arena() { + terminate(); + } + + //! Returns true if the arena is active (initialized); false otherwise. + //! The name was chosen to match a task_scheduler_init method with the same semantics. + bool is_active() const { + return my_initialization_state.load(std::memory_order_acquire) == do_once_state::initialized; + } + + //! Enqueues a task into the arena to process a functor, and immediately returns. + //! Does not require the calling thread to join the arena + + template + void enqueue(F&& f) { + initialize(); + enqueue_impl(std::forward(f), this); + } + + //! Enqueues a task into the arena to process a functor wrapped in task_handle, and immediately returns. + //! Does not require the calling thread to join the arena + void enqueue(d2::task_handle&& th) { + initialize(); + d2::enqueue_impl(std::move(th), this); + } + + //! Joins the arena and executes a mutable functor, then returns + //! If not possible to join, wraps the functor into a task, enqueues it and waits for task completion + //! Can decrement the arena demand for workers, causing a worker to leave and free a slot to the calling thread + //! Since C++11, the method returns the value returned by functor (prior to C++11 it returns void). + template + auto execute(F&& f) -> decltype(f()) { + return execute_impl(f); + } + +#if __TBB_EXTRA_DEBUG + //! 
Returns my_num_reserved_slots + int debug_reserved_slots() const { + // Handle special cases inside the library + return my_num_reserved_slots; + } + + //! Returns my_max_concurrency + int debug_max_concurrency() const { + // Handle special cases inside the library + return my_max_concurrency; + } + + //! Wait for all work in the arena to be completed + //! Even submitted by other application threads + //! Joins arena if/when possible (in the same way as execute()) + void debug_wait_until_empty() { + initialize(); + r1::wait(*this); + } +#endif //__TBB_EXTRA_DEBUG + + //! Returns the maximal number of threads that can work inside the arena + int max_concurrency() const { + // Handle special cases inside the library + return (my_max_concurrency > 1) ? my_max_concurrency : r1::max_concurrency(this); + } + + friend void submit(task& t, task_arena& ta, task_group_context& ctx, bool as_critical) { + __TBB_ASSERT(ta.is_active(), nullptr); + call_itt_task_notify(releasing, &t); + r1::submit(t, ctx, ta.my_arena.load(std::memory_order_relaxed), as_critical ? 1 : 0); + } +}; + +//! Executes a mutable functor in isolation within the current task arena. +//! Since C++11, the method returns the value returned by functor (prior to C++11 it returns void). +template +inline auto isolate(F&& f) -> decltype(f()) { + return isolate_impl(f); +} + +//! Returns the index, aka slot number, of the calling thread in its current arena +inline int current_thread_index() { + slot_id idx = r1::execution_slot(nullptr); + return idx == slot_id(-1) ? task_arena_base::not_initialized : int(idx); +} + +#if __TBB_PREVIEW_TASK_GROUP_EXTENSIONS +inline bool is_inside_task() { + return nullptr != current_context(); +} +#endif //__TBB_PREVIEW_TASK_GROUP_EXTENSIONS + +//! Returns the maximal number of threads that can work inside the arena +inline int max_concurrency() { + return r1::max_concurrency(nullptr); +} + +inline void enqueue(d2::task_handle&& th) { + d2::enqueue_impl(std::move(th), nullptr); +} + +template +inline void enqueue(F&& f) { + enqueue_impl(std::forward(f), nullptr); +} + +using r1::submit; + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::task_arena; +using detail::d1::attach; + +#if __TBB_PREVIEW_TASK_GROUP_EXTENSIONS +using detail::d1::is_inside_task; +#endif + +namespace this_task_arena { +using detail::d1::current_thread_index; +using detail::d1::max_concurrency; +using detail::d1::isolate; + +using detail::d1::enqueue; +} // namespace this_task_arena + +} // inline namespace v1 + +} // namespace tbb +#endif /* __TBB_task_arena_H */ diff --git a/third_party/tbb/task_dispatcher.cpp b/third_party/tbb/task_dispatcher.cpp new file mode 100644 index 000000000..0ab1cb9ac --- /dev/null +++ b/third_party/tbb/task_dispatcher.cpp @@ -0,0 +1,245 @@ +// clang-format off +/* + Copyright (c) 2020-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "third_party/tbb/task_dispatcher.h" +#include "third_party/tbb/waiters.h" + +namespace tbb { +namespace detail { +namespace r1 { + +static inline void spawn_and_notify(d1::task& t, arena_slot* slot, arena* a) { + slot->spawn(t); + a->advertise_new_work(); + // TODO: TBB_REVAMP_TODO slot->assert_task_pool_valid(); +} + +void __TBB_EXPORTED_FUNC spawn(d1::task& t, d1::task_group_context& ctx) { + thread_data* tls = governor::get_thread_data(); + task_group_context_impl::bind_to(ctx, tls); + arena* a = tls->my_arena; + arena_slot* slot = tls->my_arena_slot; + // Capture current context + task_accessor::context(t) = &ctx; + // Mark isolation + task_accessor::isolation(t) = tls->my_task_dispatcher->m_execute_data_ext.isolation; + spawn_and_notify(t, slot, a); +} + +void __TBB_EXPORTED_FUNC spawn(d1::task& t, d1::task_group_context& ctx, d1::slot_id id) { + thread_data* tls = governor::get_thread_data(); + task_group_context_impl::bind_to(ctx, tls); + arena* a = tls->my_arena; + arena_slot* slot = tls->my_arena_slot; + execution_data_ext& ed = tls->my_task_dispatcher->m_execute_data_ext; + + // Capture context + task_accessor::context(t) = &ctx; + // Mark isolation + task_accessor::isolation(t) = ed.isolation; + + if ( id != d1::no_slot && id != tls->my_arena_index && id < a->my_num_slots) { + // Allocate proxy task + d1::small_object_allocator alloc{}; + auto proxy = alloc.new_object(static_cast(ed)); + // Mark as a proxy + task_accessor::set_proxy_trait(*proxy); + // Mark isolation for the proxy task + task_accessor::isolation(*proxy) = ed.isolation; + // Deallocation hint (tls) from the task allocator + proxy->allocator = alloc; + proxy->slot = id; + proxy->outbox = &a->mailbox(id); + // Mark proxy as present in both locations (sender's task pool and destination mailbox) + proxy->task_and_tag = intptr_t(&t) | task_proxy::location_mask; + // Mail the proxy - after this point t may be destroyed by another thread at any moment. + proxy->outbox->push(proxy); + // Spawn proxy to the local task pool + spawn_and_notify(*proxy, slot, a); + } else { + spawn_and_notify(t, slot, a); + } +} + +void __TBB_EXPORTED_FUNC submit(d1::task& t, d1::task_group_context& ctx, arena* a, std::uintptr_t as_critical) { + suppress_unused_warning(as_critical); + assert_pointer_valid(a); + thread_data& tls = *governor::get_thread_data(); + + // TODO revamp: for each use case investigate neccesity to make this call + task_group_context_impl::bind_to(ctx, &tls); + task_accessor::context(t) = &ctx; + // TODO revamp: consider respecting task isolation if this call is being made by external thread + task_accessor::isolation(t) = tls.my_task_dispatcher->m_execute_data_ext.isolation; + + // TODO: consider code refactoring when lane selection mechanism is unified. + + if ( tls.is_attached_to(a) ) { + arena_slot* slot = tls.my_arena_slot; +#if __TBB_PREVIEW_CRITICAL_TASKS + if( as_critical ) { + a->my_critical_task_stream.push( &t, subsequent_lane_selector(slot->critical_hint()) ); + } else +#endif + { + slot->spawn(t); + } + } else { + random_lane_selector lane_selector{tls.my_random}; +#if !__TBB_PREVIEW_CRITICAL_TASKS + suppress_unused_warning(as_critical); +#else + if ( as_critical ) { + a->my_critical_task_stream.push( &t, lane_selector ); + } else +#endif + { + // Avoid joining the arena the thread is not currently in. + a->my_fifo_task_stream.push( &t, lane_selector ); + } + } + // It is assumed that some thread will explicitly wait in the arena the task is submitted + // into. 
Therefore, no need to utilize mandatory concurrency here. + a->advertise_new_work(); +} + +void __TBB_EXPORTED_FUNC execute_and_wait(d1::task& t, d1::task_group_context& t_ctx, d1::wait_context& wait_ctx, d1::task_group_context& w_ctx) { + task_accessor::context(t) = &t_ctx; + task_dispatcher::execute_and_wait(&t, wait_ctx, w_ctx); +} + +void __TBB_EXPORTED_FUNC wait(d1::wait_context& wait_ctx, d1::task_group_context& w_ctx) { + // Enter the task dispatch loop without a task + task_dispatcher::execute_and_wait(nullptr, wait_ctx, w_ctx); +} + +d1::slot_id __TBB_EXPORTED_FUNC execution_slot(const d1::execution_data* ed) { + if (ed) { + const execution_data_ext* ed_ext = static_cast(ed); + assert_pointers_valid(ed_ext->task_disp, ed_ext->task_disp->m_thread_data); + return ed_ext->task_disp->m_thread_data->my_arena_index; + } else { + thread_data* td = governor::get_thread_data_if_initialized(); + return td ? td->my_arena_index : d1::slot_id(-1); + } +} + +d1::task_group_context* __TBB_EXPORTED_FUNC current_context() { + thread_data* td = governor::get_thread_data(); + assert_pointers_valid(td, td->my_task_dispatcher); + + task_dispatcher* task_disp = td->my_task_dispatcher; + if (task_disp->m_properties.outermost) { + // No one task is executed, so no execute_data. + return nullptr; + } else { + return td->my_task_dispatcher->m_execute_data_ext.context; + } +} + +void task_dispatcher::execute_and_wait(d1::task* t, d1::wait_context& wait_ctx, d1::task_group_context& w_ctx) { + // Get an associated task dispatcher + thread_data* tls = governor::get_thread_data(); + __TBB_ASSERT(tls->my_task_dispatcher != nullptr, nullptr); + task_dispatcher& local_td = *tls->my_task_dispatcher; + + // TODO: factor out the binding to execute_and_wait_impl + if (t) { + task_group_context_impl::bind_to(*task_accessor::context(*t), tls); + // Propagate the isolation to the task executed without spawn. + task_accessor::isolation(*t) = tls->my_task_dispatcher->m_execute_data_ext.isolation; + } + + // Waiting on special object tied to a waiting thread. 
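+    // The waiter couples this thread's wait_context with its arena: inside
+    // local_wait_for_all() it is consulted (continue_execution/pause) to decide whether to
+    // keep dispatching tasks or to back off until wait_ctx reports completion.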
+ external_waiter waiter{ *tls->my_arena, wait_ctx }; + t = local_td.local_wait_for_all(t, waiter); + __TBB_ASSERT_EX(t == nullptr, "External waiter must not leave dispatch loop with a task"); + + // The external thread couldn't exit the dispatch loop in an idle state + if (local_td.m_thread_data->my_inbox.is_idle_state(true)) { + local_td.m_thread_data->my_inbox.set_is_idle(false); + } + + auto exception = w_ctx.my_exception.load(std::memory_order_acquire); + if (exception) { + __TBB_ASSERT(w_ctx.is_group_execution_cancelled(), "The task group context with an exception should be canceled."); + exception->throw_self(); + } +} + +#if __TBB_RESUMABLE_TASKS + +#if _WIN32 +/* [[noreturn]] */ void __stdcall co_local_wait_for_all(void* addr) noexcept +#else +/* [[noreturn]] */ void co_local_wait_for_all(unsigned hi, unsigned lo) noexcept +#endif +{ +#if !_WIN32 + std::uintptr_t addr = lo; + __TBB_ASSERT(sizeof(addr) == 8 || hi == 0, nullptr); + addr += std::uintptr_t(std::uint64_t(hi) << 32); +#endif + task_dispatcher& task_disp = *reinterpret_cast(addr); + assert_pointers_valid(task_disp.m_thread_data, task_disp.m_thread_data->my_arena); + task_disp.set_stealing_threshold(task_disp.m_thread_data->my_arena->calculate_stealing_threshold()); + __TBB_ASSERT(task_disp.can_steal(), nullptr); + task_disp.co_local_wait_for_all(); + // This code is unreachable +} + +/* [[noreturn]] */ void task_dispatcher::co_local_wait_for_all() noexcept { + // Do not create non-trivial objects on the stack of this function. They will never be destroyed. + assert_pointer_valid(m_thread_data); + + m_suspend_point->finilize_resume(); + // Basically calls the user callback passed to the tbb::task::suspend function + do_post_resume_action(); + + // Endless loop here because coroutine could be reused + d1::task* resume_task{}; + do { + arena* a = m_thread_data->my_arena; + coroutine_waiter waiter(*a); + resume_task = local_wait_for_all(nullptr, waiter); + assert_task_valid(resume_task); + __TBB_ASSERT(this == m_thread_data->my_task_dispatcher, nullptr); + + m_thread_data->set_post_resume_action(post_resume_action::cleanup, this); + + } while (resume(static_cast(resume_task)->m_target)); + // This code might be unreachable +} + +d1::suspend_point task_dispatcher::get_suspend_point() { + if (m_suspend_point == nullptr) { + assert_pointer_valid(m_thread_data); + // 0 means that we attach this task dispatcher to the current stack + init_suspend_point(m_thread_data->my_arena, 0); + } + assert_pointer_valid(m_suspend_point); + return m_suspend_point; +} +void task_dispatcher::init_suspend_point(arena* a, std::size_t stack_size) { + __TBB_ASSERT(m_suspend_point == nullptr, nullptr); + m_suspend_point = new(cache_aligned_allocate(sizeof(suspend_point_type))) + suspend_point_type(a, stack_size, *this); +} +#endif /* __TBB_RESUMABLE_TASKS */ +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/task_dispatcher.h b/third_party/tbb/task_dispatcher.h new file mode 100644 index 000000000..4bcbbf66e --- /dev/null +++ b/third_party/tbb/task_dispatcher.h @@ -0,0 +1,469 @@ +// clang-format off +/* + Copyright (c) 2020-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_task_dispatcher_H +#define _TBB_task_dispatcher_H + +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/global_control.h" + +#include "third_party/tbb/scheduler_common.h" +#include "third_party/tbb/waiters.h" +#include "third_party/tbb/arena_slot.h" +#include "third_party/tbb/arena.h" +#include "third_party/tbb/thread_data.h" +#include "third_party/tbb/mailbox.h" +#include "third_party/tbb/itt_notify.h" +#include "third_party/tbb/concurrent_monitor.h" +#include "third_party/tbb/threading_control.h" + +#include "third_party/libcxx/atomic" + +#if !__TBB_CPU_CTL_ENV_PRESENT +#include "libc/runtime/fenv.h" // +#endif + +namespace tbb { +namespace detail { +namespace r1 { + +inline d1::task* get_self_recall_task(arena_slot& slot) { + suppress_unused_warning(slot); + d1::task* t = nullptr; +#if __TBB_RESUMABLE_TASKS + suspend_point_type* sp = slot.default_task_dispatcher().m_suspend_point; + if (sp && sp->m_is_owner_recalled.load(std::memory_order_acquire)) { + t = &sp->m_resume_task; + __TBB_ASSERT(sp->m_resume_task.m_target.m_thread_data == nullptr, nullptr); + } +#endif /* __TBB_RESUMABLE_TASKS */ + return t; +} + +// Defined in exception.cpp +/*[[noreturn]]*/void do_throw_noexcept(void (*throw_exception)()) noexcept; + +//------------------------------------------------------------------------ +// Suspend point +//------------------------------------------------------------------------ +#if __TBB_RESUMABLE_TASKS + +inline d1::task* suspend_point_type::resume_task::execute(d1::execution_data& ed) { + execution_data_ext& ed_ext = static_cast(ed); + + if (ed_ext.wait_ctx) { + thread_control_monitor::resume_context monitor_node{{std::uintptr_t(ed_ext.wait_ctx), nullptr}, ed_ext, m_target}; + // The wait_ctx is present only in external_waiter. In that case we leave the current stack + // in the abandoned state to resume when waiting completes. + thread_data* td = ed_ext.task_disp->m_thread_data; + td->set_post_resume_action(task_dispatcher::post_resume_action::register_waiter, &monitor_node); + + thread_control_monitor& wait_list = td->my_arena->get_waiting_threads_monitor(); + + if (wait_list.wait([&] { return !ed_ext.wait_ctx->continue_execution(); }, monitor_node)) { + return nullptr; + } + + td->clear_post_resume_action(); + r1::resume(ed_ext.task_disp->get_suspend_point()); + } else { + // If wait_ctx is null, it can be only a worker thread on outermost level because + // coroutine_waiter interrupts bypass loop before the resume_task execution. 
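+        // The 'notify' action is performed after the coroutine switch (see
+        // do_post_resume_action() in task.cpp): it recalls the owner of this suspend point
+        // and then wakes any thread waiting for it in the arena's thread_control_monitor.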
+ ed_ext.task_disp->m_thread_data->set_post_resume_action(task_dispatcher::post_resume_action::notify, + ed_ext.task_disp->get_suspend_point()); + } + // Do not access this task because it might be destroyed + ed_ext.task_disp->resume(m_target); + return nullptr; +} + +inline suspend_point_type::suspend_point_type(arena* a, size_t stack_size, task_dispatcher& task_disp) + : m_arena(a) + , m_random(this) + , m_co_context(stack_size, &task_disp) + , m_resume_task(task_disp) +{ + assert_pointer_valid(m_arena); + assert_pointer_valid(m_arena->my_default_ctx); + task_accessor::context(m_resume_task) = m_arena->my_default_ctx; + task_accessor::isolation(m_resume_task) = no_isolation; + // Initialize the itt_caller for the context of the resume task. + // It will be bound to the stack of the first suspend call. + task_group_context_impl::bind_to(*task_accessor::context(m_resume_task), task_disp.m_thread_data); +} + +#endif /* __TBB_RESUMABLE_TASKS */ + +//------------------------------------------------------------------------ +// Task Dispatcher +//------------------------------------------------------------------------ +inline task_dispatcher::task_dispatcher(arena* a) { + m_execute_data_ext.context = a->my_default_ctx; + m_execute_data_ext.task_disp = this; +} + +inline bool task_dispatcher::can_steal() { + __TBB_ASSERT(m_stealing_threshold != 0, nullptr); + stack_anchor_type anchor{}; + return reinterpret_cast(&anchor) > m_stealing_threshold; +} + +inline d1::task* task_dispatcher::get_inbox_or_critical_task( + execution_data_ext& ed, mail_inbox& inbox, isolation_type isolation, bool critical_allowed) +{ + if (inbox.empty()) + return nullptr; + d1::task* result = get_critical_task(nullptr, ed, isolation, critical_allowed); + if (result) + return result; + // Check if there are tasks mailed to this thread via task-to-thread affinity mechanism. + result = get_mailbox_task(inbox, ed, isolation); + // There is a race with a thread adding a new task (possibly with suitable isolation) + // to our mailbox, so the below conditions might result in a false positive. + // Then set_is_idle(false) allows that task to be stolen; it's OK. + if (isolation != no_isolation && !result && !inbox.empty() && inbox.is_idle_state(true)) { + // We have proxy tasks in our mailbox but the isolation blocks their execution. + // So publish the proxy tasks in mailbox to be available for stealing from owner's task pool. 
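+        // (Each proxy was registered in both the sender's task pool and this mailbox; see
+        // the slot_id overload of spawn() in task_dispatcher.cpp.)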
+ inbox.set_is_idle( false ); + } + return result; +} + +inline d1::task* task_dispatcher::get_stream_or_critical_task( + execution_data_ext& ed, arena& a, task_stream& stream, unsigned& hint, + isolation_type isolation, bool critical_allowed) +{ + if (stream.empty()) + return nullptr; + d1::task* result = get_critical_task(nullptr, ed, isolation, critical_allowed); + if (result) + return result; + return a.get_stream_task(stream, hint); +} + +inline d1::task* task_dispatcher::steal_or_get_critical( + execution_data_ext& ed, arena& a, unsigned arena_index, FastRandom& random, + isolation_type isolation, bool critical_allowed) +{ + if (d1::task* t = a.steal_task(arena_index, random, ed, isolation)) { + ed.context = task_accessor::context(*t); + ed.isolation = task_accessor::isolation(*t); + return get_critical_task(t, ed, isolation, critical_allowed); + } + return nullptr; +} + +template +d1::task* task_dispatcher::receive_or_steal_task( + thread_data& tls, execution_data_ext& ed, Waiter& waiter, isolation_type isolation, + bool fifo_allowed, bool critical_allowed) +{ + __TBB_ASSERT(governor::is_thread_data_set(&tls), nullptr); + // Task to return + d1::task* t = nullptr; + // Get tls data (again) + arena& a = *tls.my_arena; + arena_slot& slot = *tls.my_arena_slot; + unsigned arena_index = tls.my_arena_index; + mail_inbox& inbox = tls.my_inbox; + task_stream& resume_stream = a.my_resume_task_stream; + unsigned& resume_hint = slot.hint_for_resume_stream; + task_stream& fifo_stream = a.my_fifo_task_stream; + unsigned& fifo_hint = slot.hint_for_fifo_stream; + + waiter.reset_wait(); + // Thread is in idle state now + inbox.set_is_idle(true); + + bool stealing_is_allowed = can_steal(); + + // Stealing loop mailbox/enqueue/other_slots + for (;;) { + __TBB_ASSERT(t == nullptr, nullptr); + // Check if the resource manager requires our arena to relinquish some threads + // For the external thread restore idle state to true after dispatch loop + if (!waiter.continue_execution(slot, t)) { + __TBB_ASSERT(t == nullptr, nullptr); + break; + } + // Start searching + if (t != nullptr) { + // continue_execution returned a task + } + else if ((t = get_inbox_or_critical_task(ed, inbox, isolation, critical_allowed))) { + // Successfully got the task from mailbox or critical task + } + else if ((t = get_stream_or_critical_task(ed, a, resume_stream, resume_hint, isolation, critical_allowed))) { + // Successfully got the resume or critical task + } + else if (fifo_allowed && isolation == no_isolation + && (t = get_stream_or_critical_task(ed, a, fifo_stream, fifo_hint, isolation, critical_allowed))) { + // Checked if there are tasks in starvation-resistant stream. Only allowed at the outermost dispatch level without isolation. + } + else if (stealing_is_allowed + && (t = steal_or_get_critical(ed, a, arena_index, tls.my_random, isolation, critical_allowed))) { + // Stole a task from a random arena slot + } + else { + t = get_critical_task(t, ed, isolation, critical_allowed); + } + + if (t != nullptr) { + ed.context = task_accessor::context(*t); + ed.isolation = task_accessor::isolation(*t); + a.my_observers.notify_entry_observers(tls.my_last_observer, tls.my_is_worker); + break; // Stealing success, end of stealing attempt + } + // Nothing to do, pause a little. 
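+        // Illustrative sketch (editor-added, not upstream TBB code): waiter.pause()
+        // implementations typically do an exponential spin-then-yield backoff before
+        // blocking on the arena's sleep monitor. The generic shape of that idea
+        // (names are hypothetical; uses std::this_thread::yield from <thread>):
+        //
+        //   struct toy_backoff {
+        //       int spins = 1;
+        //       static constexpr int spin_limit = 16;
+        //       void pause() {
+        //           if (spins <= spin_limit) {
+        //               for (int i = 0; i < spins; ++i) { /* cpu pause/relax */ }
+        //               spins *= 2;                 // spin a little longer next time
+        //           } else {
+        //               std::this_thread::yield();  // then start yielding the time slice
+        //           }
+        //       }
+        //   };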
+ waiter.pause(slot); + } // end of nonlocal task retrieval loop + + __TBB_ASSERT(is_alive(a.my_guard), nullptr); + if (inbox.is_idle_state(true)) { + inbox.set_is_idle(false); + } + return t; +} + +template +d1::task* task_dispatcher::local_wait_for_all(d1::task* t, Waiter& waiter ) { + assert_pointer_valid(m_thread_data); + __TBB_ASSERT(m_thread_data->my_task_dispatcher == this, nullptr); + + // Guard an outer/default execution state + struct dispatch_loop_guard { + task_dispatcher& task_disp; + execution_data_ext old_execute_data_ext; + properties old_properties; + + ~dispatch_loop_guard() { + task_disp.m_execute_data_ext = old_execute_data_ext; + task_disp.m_properties = old_properties; + + __TBB_ASSERT(task_disp.m_thread_data && governor::is_thread_data_set(task_disp.m_thread_data), nullptr); + __TBB_ASSERT(task_disp.m_thread_data->my_task_dispatcher == &task_disp, nullptr); + } + } dl_guard{ *this, m_execute_data_ext, m_properties }; + + // The context guard to track fp setting and itt tasks. + context_guard_helper context_guard; + + // Current isolation context + const isolation_type isolation = dl_guard.old_execute_data_ext.isolation; + + // Critical work inflection point. Once turned false current execution context has taken + // critical task on the previous stack frame and cannot take more until that critical path is + // finished. + bool critical_allowed = dl_guard.old_properties.critical_task_allowed; + + // Extended execution data that is used for dispatching. + // Base version is passed to the task::execute method. + execution_data_ext& ed = m_execute_data_ext; + ed.context = t ? task_accessor::context(*t) : nullptr; + ed.original_slot = m_thread_data->my_arena_index; + ed.affinity_slot = d1::no_slot; + ed.task_disp = this; + ed.wait_ctx = waiter.wait_ctx(); + + m_properties.outermost = false; + m_properties.fifo_tasks_allowed = false; + + t = get_critical_task(t, ed, isolation, critical_allowed); + if (t && m_thread_data->my_inbox.is_idle_state(true)) { + // The thread has a work to do. Therefore, marking its inbox as not idle so that + // affinitized tasks can be stolen from it. + m_thread_data->my_inbox.set_is_idle(false); + } + + // Infinite exception loop + for (;;) { + try { + // Main execution loop + do { + // We assume that bypass tasks are from the same task group. + context_guard.set_ctx(ed.context); + // Inner level evaluates tasks coming from nesting loops and those returned + // by just executed tasks (bypassing spawn or enqueue calls). + while (t != nullptr) { + assert_task_valid(t); + assert_pointer_valid(ed.context); + __TBB_ASSERT(ed.context->my_state == d1::task_group_context::state::bound || + ed.context->my_state == d1::task_group_context::state::isolated, nullptr); + __TBB_ASSERT(m_thread_data->my_inbox.is_idle_state(false), nullptr); + __TBB_ASSERT(task_accessor::is_resume_task(*t) || isolation == no_isolation || isolation == ed.isolation, nullptr); + // Check premature leave + if (Waiter::postpone_execution(*t)) { + __TBB_ASSERT(task_accessor::is_resume_task(*t) && dl_guard.old_properties.outermost, + "Currently, the bypass loop can be interrupted only for resume task on outermost level"); + return t; + } + // Copy itt_caller to a stack because the context might be destroyed after t->execute. 
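+                    // Illustrative sketch (editor-added, not upstream TBB code): this inner
+                    // loop implements task bypass: execute() may return the next task to run,
+                    // which is dispatched immediately on this thread without a round trip
+                    // through the task pool. A task written against d1::task (the scheme that
+                    // function_task in task_group.h follows) looks roughly like:
+                    //
+                    //   struct chained_task : tbb::detail::d1::task {
+                    //       tbb::detail::d1::task* next = nullptr;  // hypothetical successor
+                    //       tbb::detail::d1::task* execute(tbb::detail::d1::execution_data&) override {
+                    //           do_work();          // placeholder for the task body
+                    //           return next;        // nullptr ends the bypass chain
+                    //       }
+                    //       tbb::detail::d1::task* cancel(tbb::detail::d1::execution_data&) override {
+                    //           return nullptr;
+                    //       }
+                    //   };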
+ void* itt_caller = ed.context->my_itt_caller; + suppress_unused_warning(itt_caller); + + ITT_CALLEE_ENTER(ITTPossible, t, itt_caller); + + if (ed.context->is_group_execution_cancelled()) { + t = t->cancel(ed); + } else { + t = t->execute(ed); + } + + ITT_CALLEE_LEAVE(ITTPossible, itt_caller); + + // The task affinity in execution data is set for affinitized tasks. + // So drop it after the task execution. + ed.affinity_slot = d1::no_slot; + // Reset task owner id for bypassed task + ed.original_slot = m_thread_data->my_arena_index; + t = get_critical_task(t, ed, isolation, critical_allowed); + } + __TBB_ASSERT(m_thread_data && governor::is_thread_data_set(m_thread_data), nullptr); + __TBB_ASSERT(m_thread_data->my_task_dispatcher == this, nullptr); + // When refactoring, pay attention that m_thread_data can be changed after t->execute() + __TBB_ASSERT(m_thread_data->my_arena_slot != nullptr, nullptr); + arena_slot& slot = *m_thread_data->my_arena_slot; + if (!waiter.continue_execution(slot, t)) { + break; + } + // Retrieve the task from local task pool + if (t || (slot.is_task_pool_published() && (t = slot.get_task(ed, isolation)))) { + __TBB_ASSERT(ed.original_slot == m_thread_data->my_arena_index, nullptr); + ed.context = task_accessor::context(*t); + ed.isolation = task_accessor::isolation(*t); + continue; + } + // Retrieve the task from global sources + t = receive_or_steal_task( + *m_thread_data, ed, waiter, isolation, dl_guard.old_properties.fifo_tasks_allowed, + critical_allowed + ); + } while (t != nullptr); // main dispatch loop + break; // Exit exception loop; + } catch (...) { + if (global_control::active_value(global_control::terminate_on_exception) == 1) { + do_throw_noexcept([] { throw; }); + } + if (ed.context->cancel_group_execution()) { + /* We are the first to signal cancellation, so store the exception that caused it. */ + ed.context->my_exception.store(tbb_exception_ptr::allocate(), std::memory_order_release); + } + } + } // Infinite exception loop + __TBB_ASSERT(t == nullptr, nullptr); + + +#if __TBB_RESUMABLE_TASKS + if (dl_guard.old_properties.outermost) { + recall_point(); + } +#endif /* __TBB_RESUMABLE_TASKS */ + + return nullptr; +} + +#if __TBB_RESUMABLE_TASKS +inline void task_dispatcher::recall_point() { + if (this != &m_thread_data->my_arena_slot->default_task_dispatcher()) { + __TBB_ASSERT(m_suspend_point != nullptr, nullptr); + __TBB_ASSERT(m_suspend_point->m_is_owner_recalled.load(std::memory_order_relaxed) == false, nullptr); + + m_thread_data->set_post_resume_action(post_resume_action::notify, get_suspend_point()); + internal_suspend(); + + if (m_thread_data->my_inbox.is_idle_state(true)) { + m_thread_data->my_inbox.set_is_idle(false); + } + } +} +#endif /* __TBB_RESUMABLE_TASKS */ + +#if __TBB_PREVIEW_CRITICAL_TASKS +inline d1::task* task_dispatcher::get_critical_task(d1::task* t, execution_data_ext& ed, isolation_type isolation, bool critical_allowed) { + __TBB_ASSERT( critical_allowed || !m_properties.critical_task_allowed, nullptr ); + + if (!critical_allowed) { + // The stack is already in the process of critical path execution. Cannot take another + // critical work until finish with the current one. 
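+        // Editor-added condensed sketch (not upstream code) of the discipline enforced
+        // below: at most one critical task may be taken per stack frame, so nested
+        // dispatch loops cannot recurse on the critical path. try_pop_critical and
+        // respawn are hypothetical stand-ins for the arena calls used below:
+        //
+        //   if (!critical_allowed) return t;              // a frame below owns the critical path
+        //   if (task* c = arena.try_pop_critical()) {
+        //       if (t) respawn(t);                        // the bypassed task is re-spawned, not lost
+        //       properties.critical_task_allowed = false; // forbid nested critical work
+        //       return c;                                 // run the critical task first
+        //   }
+        //   properties.critical_task_allowed = true;      // queue drained: re-enable for nested frames
+        //   return t;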
+ __TBB_ASSERT(!m_properties.critical_task_allowed, nullptr); + return t; + } + + assert_pointers_valid(m_thread_data, m_thread_data->my_arena, m_thread_data->my_arena_slot); + thread_data& td = *m_thread_data; + arena& a = *td.my_arena; + arena_slot& slot = *td.my_arena_slot; + + d1::task* crit_t = a.get_critical_task(slot.hint_for_critical_stream, isolation); + if (crit_t != nullptr) { + assert_task_valid(crit_t); + if (t != nullptr) { + assert_pointer_valid(ed.context); + r1::spawn(*t, *ed.context); + } + ed.context = task_accessor::context(*crit_t); + ed.isolation = task_accessor::isolation(*crit_t); + + // We cannot execute more than one critical task on the same stack. + // In other words, we prevent nested critical tasks. + m_properties.critical_task_allowed = false; + + // TODO: add a test that the observer is called when critical task is taken. + a.my_observers.notify_entry_observers(td.my_last_observer, td.my_is_worker); + t = crit_t; + } else { + // Was unable to find critical work in the queue. Allow inspecting the queue in nested + // invocations. Handles the case when critical task has been just completed. + m_properties.critical_task_allowed = true; + } + return t; +} +#else +inline d1::task* task_dispatcher::get_critical_task(d1::task* t, execution_data_ext&, isolation_type, bool /*critical_allowed*/) { + return t; +} +#endif + +inline d1::task* task_dispatcher::get_mailbox_task(mail_inbox& my_inbox, execution_data_ext& ed, isolation_type isolation) { + while (task_proxy* const tp = my_inbox.pop(isolation)) { + if (d1::task* result = tp->extract_task()) { + ed.original_slot = (unsigned short)(-2); + ed.affinity_slot = ed.task_disp->m_thread_data->my_arena_index; + return result; + } + // We have exclusive access to the proxy, and can destroy it. + tp->allocator.delete_object(tp, ed); + } + return nullptr; +} + +template +d1::task* task_dispatcher::local_wait_for_all(d1::task* t, Waiter& waiter) { + if (governor::is_itt_present()) { + return local_wait_for_all(t, waiter); + } else { + return local_wait_for_all(t, waiter); + } +} + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_task_dispatcher_H + diff --git a/third_party/tbb/task_group.h b/third_party/tbb/task_group.h new file mode 100644 index 000000000..70d0bccd8 --- /dev/null +++ b/third_party/tbb/task_group.h @@ -0,0 +1,747 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_task_group_H +#define __TBB_task_group_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/detail/_small_object_pool.h" +#include "third_party/tbb/detail/_intrusive_list_node.h" +#include "third_party/tbb/detail/_task_handle.h" + +#include "third_party/tbb/profiling.h" + +#include "third_party/libcxx/type_traits" + +#if _MSC_VER && !defined(__INTEL_COMPILER) + // Suppress warning: structure was padded due to alignment specifier + #pragma warning(push) + #pragma warning(disable:4324) +#endif + +namespace tbb { +namespace detail { + +namespace d1 { +class delegate_base; +class task_arena_base; +class task_group_context; +class task_group_base; +} + +namespace r1 { +// Forward declarations +class tbb_exception_ptr; +class cancellation_disseminator; +class thread_data; +class task_dispatcher; +template +class context_guard_helper; +struct task_arena_impl; +class context_list; + +TBB_EXPORT void __TBB_EXPORTED_FUNC execute(d1::task_arena_base&, d1::delegate_base&); +TBB_EXPORT void __TBB_EXPORTED_FUNC isolate_within_arena(d1::delegate_base&, std::intptr_t); + +TBB_EXPORT void __TBB_EXPORTED_FUNC initialize(d1::task_group_context&); +TBB_EXPORT void __TBB_EXPORTED_FUNC destroy(d1::task_group_context&); +TBB_EXPORT void __TBB_EXPORTED_FUNC reset(d1::task_group_context&); +TBB_EXPORT bool __TBB_EXPORTED_FUNC cancel_group_execution(d1::task_group_context&); +TBB_EXPORT bool __TBB_EXPORTED_FUNC is_group_execution_cancelled(d1::task_group_context&); +TBB_EXPORT void __TBB_EXPORTED_FUNC capture_fp_settings(d1::task_group_context&); + +struct task_group_context_impl; +} + +namespace d2 { + +namespace { +template +d1::task* task_ptr_or_nullptr(F&& f); +} + +template +class function_task : public task_handle_task { + //TODO: apply empty base optimization here + const F m_func; + +private: + d1::task* execute(d1::execution_data& ed) override { + __TBB_ASSERT(ed.context == &this->ctx(), "The task group context should be used for all tasks"); + task* res = task_ptr_or_nullptr(m_func); + finalize(&ed); + return res; + } + d1::task* cancel(d1::execution_data& ed) override { + finalize(&ed); + return nullptr; + } +public: + template + function_task(FF&& f, d1::wait_context& wo, d1::task_group_context& ctx, d1::small_object_allocator& alloc) + : task_handle_task{wo, ctx, alloc}, + m_func(std::forward(f)) {} +}; + +#if __TBB_PREVIEW_TASK_GROUP_EXTENSIONS +namespace { + template + d1::task* task_ptr_or_nullptr_impl(std::false_type, F&& f){ + task_handle th = std::forward(f)(); + return task_handle_accessor::release(th); + } + + template + d1::task* task_ptr_or_nullptr_impl(std::true_type, F&& f){ + std::forward(f)(); + return nullptr; + } + + template + d1::task* task_ptr_or_nullptr(F&& f){ + using is_void_t = std::is_void< + decltype(std::forward(f)()) + >; + + return task_ptr_or_nullptr_impl(is_void_t{}, std::forward(f)); + } +} +#else +namespace { + template + d1::task* task_ptr_or_nullptr(F&& f){ + std::forward(f)(); + return nullptr; + } +} // namespace +#endif // __TBB_PREVIEW_TASK_GROUP_EXTENSIONS +} // namespace d2 + +namespace d1 { + +// This structure is left here for backward compatibility check +struct context_list_node { + std::atomic prev{}; + std::atomic next{}; +}; 
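+
+// Illustrative usage sketch (editor-added, not upstream TBB code) for the classes
+// defined below; tbb::task_group, tbb::task_group_context, and tbb::canceled are all
+// exported from this header:
+//
+//   bool run_cancellable_work() {
+//       tbb::task_group_context ctx(tbb::task_group_context::isolated);
+//       tbb::task_group tg(ctx);
+//       tg.run([] { /* long-running work */ });
+//       // ... later, possibly from another thread:
+//       ctx.cancel_group_execution();        // request cancellation of the whole group
+//       return tg.wait() == tbb::canceled;   // wait() reports canceled vs. complete
+//   }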
+ +//! Used to form groups of tasks +/** @ingroup task_scheduling + The context services explicit cancellation requests from user code, and unhandled + exceptions intercepted during tasks execution. Intercepting an exception results + in generating internal cancellation requests (which is processed in exactly the + same way as external ones). + + The context is associated with one or more root tasks and defines the cancellation + group that includes all the descendants of the corresponding root task(s). Association + is established when a context object is passed as an argument to the task::allocate_root() + method. See task_group_context::task_group_context for more details. + + The context can be bound to another one, and other contexts can be bound to it, + forming a tree-like structure: parent -> this -> children. Arrows here designate + cancellation propagation direction. If a task in a cancellation group is cancelled + all the other tasks in this group and groups bound to it (as children) get cancelled too. +**/ +class task_group_context : no_copy { +public: + enum traits_type { + fp_settings = 1 << 1, + concurrent_wait = 1 << 2, + default_traits = 0 + }; + enum kind_type { + isolated, + bound + }; +private: + //! Space for platform-specific FPU settings. + /** Must only be accessed inside TBB binaries, and never directly in user + code or inline methods. */ + std::uint64_t my_cpu_ctl_env; + + //! Specifies whether cancellation was requested for this task group. + std::atomic my_cancellation_requested; + + //! Versioning for run-time checks and behavioral traits of the context. + enum class task_group_context_version : std::uint8_t { + unused = 1 // ensure that new versions, if any, will not clash with previously used ones + }; + task_group_context_version my_version; + + //! The context traits. + struct context_traits { + bool fp_settings : 1; + bool concurrent_wait : 1; + bool bound : 1; + bool reserved1 : 1; + bool reserved2 : 1; + bool reserved3 : 1; + bool reserved4 : 1; + bool reserved5 : 1; + } my_traits; + + static_assert(sizeof(context_traits) == 1, "Traits shall fit into one byte."); + + static constexpr std::uint8_t may_have_children = 1; + //! The context internal state (currently only may_have_children). + std::atomic my_may_have_children; + + enum class state : std::uint8_t { + created, + locked, + isolated, + bound, + dead, + proxy = std::uint8_t(-1) //the context is not the real one, but proxy to other one + }; + + //! The synchronization machine state to manage lifetime. + std::atomic my_state; + + union { + //! Pointer to the context of the parent cancellation group. nullptr for isolated contexts. + task_group_context* my_parent; + + //! Pointer to the actual context 'this' context represents a proxy of. + task_group_context* my_actual_context; + }; + + //! Thread data instance that registered this context in its list. + r1::context_list* my_context_list; + static_assert(sizeof(std::atomic) == sizeof(r1::context_list*), "To preserve backward compatibility these types should have the same size"); + + //! Used to form the thread specific list of contexts without additional memory allocation. + /** A context is included into the list of the current thread when its binding to + its parent happens. Any context can be present in the list of one thread only. **/ + intrusive_list_node my_node; + static_assert(sizeof(intrusive_list_node) == sizeof(context_list_node), "To preserve backward compatibility these types should have the same size"); + + //! 
Pointer to the container storing exception being propagated across this task group. + std::atomic my_exception; + static_assert(sizeof(std::atomic) == sizeof(r1::tbb_exception_ptr*), + "backward compatibility check"); + + //! Used to set and maintain stack stitching point for Intel Performance Tools. + void* my_itt_caller; + + //! Description of algorithm for scheduler based instrumentation. + string_resource_index my_name; + + char padding[max_nfs_size + - sizeof(std::uint64_t) // my_cpu_ctl_env + - sizeof(std::atomic) // my_cancellation_requested + - sizeof(std::uint8_t) // my_version + - sizeof(context_traits) // my_traits + - sizeof(std::atomic) // my_state + - sizeof(std::atomic) // my_state + - sizeof(task_group_context*) // my_parent + - sizeof(r1::context_list*) // my_context_list + - sizeof(intrusive_list_node) // my_node + - sizeof(std::atomic) // my_exception + - sizeof(void*) // my_itt_caller + - sizeof(string_resource_index) // my_name + ]; + + task_group_context(context_traits t, string_resource_index name) + : my_version{task_group_context_version::unused}, my_name{name} + { + my_traits = t; // GCC4.8 issues warning list initialization for bitset (missing-field-initializers) + r1::initialize(*this); + } + + task_group_context(task_group_context* actual_context) + : my_version{task_group_context_version::unused} + , my_state{state::proxy} + , my_actual_context{actual_context} + { + __TBB_ASSERT(my_actual_context, "Passed pointer value points to nothing."); + my_name = actual_context->my_name; + + // no need to initialize 'this' context as it acts as a proxy for my_actual_context, which + // initialization is a user-side responsibility. + } + + static context_traits make_traits(kind_type relation_with_parent, std::uintptr_t user_traits) { + context_traits ct; + ct.fp_settings = (user_traits & fp_settings) == fp_settings; + ct.concurrent_wait = (user_traits & concurrent_wait) == concurrent_wait; + ct.bound = relation_with_parent == bound; + ct.reserved1 = ct.reserved2 = ct.reserved3 = ct.reserved4 = ct.reserved5 = false; + return ct; + } + + bool is_proxy() const { + return my_state.load(std::memory_order_relaxed) == state::proxy; + } + + task_group_context& actual_context() noexcept { + if (is_proxy()) { + __TBB_ASSERT(my_actual_context, "Actual task_group_context is not set."); + return *my_actual_context; + } + return *this; + } + + const task_group_context& actual_context() const noexcept { + if (is_proxy()) { + __TBB_ASSERT(my_actual_context, "Actual task_group_context is not set."); + return *my_actual_context; + } + return *this; + } + +public: + //! Default & binding constructor. + /** By default a bound context is created. That is this context will be bound + (as child) to the context of the currently executing task . Cancellation + requests passed to the parent context are propagated to all the contexts + bound to it. Similarly priority change is propagated from the parent context + to its children. + + If task_group_context::isolated is used as the argument, then the tasks associated + with this context will never be affected by events in any other context. + + Creating isolated contexts involve much less overhead, but they have limited + utility. Normally when an exception occurs in an algorithm that has nested + ones running, it is desirably to have all the nested algorithms cancelled + as well. Such a behavior requires nested algorithms to use bound contexts. + + There is one good place where using isolated algorithms is beneficial. It is + an external thread. 
That is if a particular algorithm is invoked directly from + the external thread (not from a TBB task), supplying it with explicitly + created isolated context will result in a faster algorithm startup. + + VERSIONING NOTE: + Implementation(s) of task_group_context constructor(s) cannot be made + entirely out-of-line because the run-time version must be set by the user + code. This will become critically important for binary compatibility, if + we ever have to change the size of the context object. **/ + + task_group_context(kind_type relation_with_parent = bound, + std::uintptr_t t = default_traits) + : task_group_context(make_traits(relation_with_parent, t), CUSTOM_CTX) {} + + // Custom constructor for instrumentation of oneTBB algorithm + task_group_context(string_resource_index name ) + : task_group_context(make_traits(bound, default_traits), name) {} + + // Do not introduce any logic on user side since it might break state propagation assumptions + ~task_group_context() { + // When 'this' serves as a proxy, the initialization does not happen - nor should the + // destruction. + if (!is_proxy()) + { + r1::destroy(*this); + } + } + + //! Forcefully reinitializes the context after the task tree it was associated with is completed. + /** Because the method assumes that all the tasks that used to be associated with + this context have already finished, calling it while the context is still + in use somewhere in the task hierarchy leads to undefined behavior. + + IMPORTANT: This method is not thread safe! + + The method does not change the context's parent if it is set. **/ + void reset() { + r1::reset(actual_context()); + } + + //! Initiates cancellation of all tasks in this cancellation group and its subordinate groups. + /** \return false if cancellation has already been requested, true otherwise. + + Note that canceling never fails. When false is returned, it just means that + another thread (or this one) has already sent cancellation request to this + context or to one of its ancestors (if this context is bound). It is guaranteed + that when this method is concurrently called on the same not yet cancelled + context, true will be returned by one and only one invocation. **/ + bool cancel_group_execution() { + return r1::cancel_group_execution(actual_context()); + } + + //! Returns true if the context received cancellation request. + bool is_group_execution_cancelled() { + return r1::is_group_execution_cancelled(actual_context()); + } + +#if __TBB_FP_CONTEXT + //! Captures the current FPU control settings to the context. + /** Because the method assumes that all the tasks that used to be associated with + this context have already finished, calling it while the context is still + in use somewhere in the task hierarchy leads to undefined behavior. + + IMPORTANT: This method is not thread safe! + + The method does not change the FPU control settings of the context's parent. **/ + void capture_fp_settings() { + r1::capture_fp_settings(actual_context()); + } +#endif + + //! Returns the user visible context trait + std::uintptr_t traits() const { + std::uintptr_t t{}; + const task_group_context& ctx = actual_context(); + t |= ctx.my_traits.fp_settings ? fp_settings : 0; + t |= ctx.my_traits.concurrent_wait ? 
concurrent_wait : 0; + return t; + } +private: + //// TODO: cleanup friends + friend class r1::cancellation_disseminator; + friend class r1::thread_data; + friend class r1::task_dispatcher; + template + friend class r1::context_guard_helper; + friend struct r1::task_arena_impl; + friend struct r1::task_group_context_impl; + friend class task_group_base; +}; // class task_group_context + +static_assert(sizeof(task_group_context) == 128, "Wrong size of task_group_context"); + +enum task_group_status { + not_complete, + complete, + canceled +}; + +class task_group; +class structured_task_group; +#if TBB_PREVIEW_ISOLATED_TASK_GROUP +class isolated_task_group; +#endif + +template +class function_task : public task { + const F m_func; + wait_context& m_wait_ctx; + small_object_allocator m_allocator; + + void finalize(const execution_data& ed) { + // Make a local reference not to access this after destruction. + wait_context& wo = m_wait_ctx; + // Copy allocator to the stack + auto allocator = m_allocator; + // Destroy user functor before release wait. + this->~function_task(); + wo.release(); + + allocator.deallocate(this, ed); + } + task* execute(execution_data& ed) override { + task* res = d2::task_ptr_or_nullptr(m_func); + finalize(ed); + return res; + } + task* cancel(execution_data& ed) override { + finalize(ed); + return nullptr; + } +public: + function_task(const F& f, wait_context& wo, small_object_allocator& alloc) + : m_func(f) + , m_wait_ctx(wo) + , m_allocator(alloc) {} + + function_task(F&& f, wait_context& wo, small_object_allocator& alloc) + : m_func(std::move(f)) + , m_wait_ctx(wo) + , m_allocator(alloc) {} +}; + +template +class function_stack_task : public task { + const F& m_func; + wait_context& m_wait_ctx; + + void finalize() { + m_wait_ctx.release(); + } + task* execute(execution_data&) override { + task* res = d2::task_ptr_or_nullptr(m_func); + finalize(); + return res; + } + task* cancel(execution_data&) override { + finalize(); + return nullptr; + } +public: + function_stack_task(const F& f, wait_context& wo) : m_func(f), m_wait_ctx(wo) {} +}; + +class task_group_base : no_copy { +protected: + wait_context m_wait_ctx; + task_group_context m_context; + + template + task_group_status internal_run_and_wait(const F& f) { + function_stack_task t{ f, m_wait_ctx }; + m_wait_ctx.reserve(); + bool cancellation_status = false; + try_call([&] { + execute_and_wait(t, context(), m_wait_ctx, context()); + }).on_completion([&] { + // TODO: the reset method is not thread-safe. Ensure the correct behavior. + cancellation_status = context().is_group_execution_cancelled(); + context().reset(); + }); + return cancellation_status ? canceled : complete; + } + + task_group_status internal_run_and_wait(d2::task_handle&& h) { + __TBB_ASSERT(h != nullptr, "Attempt to schedule empty task_handle"); + + using acs = d2::task_handle_accessor; + __TBB_ASSERT(&acs::ctx_of(h) == &context(), "Attempt to schedule task_handle into different task_group"); + + bool cancellation_status = false; + try_call([&] { + execute_and_wait(*acs::release(h), context(), m_wait_ctx, context()); + }).on_completion([&] { + // TODO: the reset method is not thread-safe. Ensure the correct behavior. + cancellation_status = context().is_group_execution_cancelled(); + context().reset(); + }); + return cancellation_status ? 
canceled : complete; + } + + template + task* prepare_task(F&& f) { + m_wait_ctx.reserve(); + small_object_allocator alloc{}; + return alloc.new_object::type>>(std::forward(f), m_wait_ctx, alloc); + } + + task_group_context& context() noexcept { + return m_context.actual_context(); + } + + template + d2::task_handle prepare_task_handle(F&& f) { + m_wait_ctx.reserve(); + small_object_allocator alloc{}; + using function_task_t = d2::function_task::type>; + d2::task_handle_task* function_task_p = alloc.new_object(std::forward(f), m_wait_ctx, context(), alloc); + + return d2::task_handle_accessor::construct(function_task_p); + } + +public: + task_group_base(uintptr_t traits = 0) + : m_wait_ctx(0) + , m_context(task_group_context::bound, task_group_context::default_traits | traits) + {} + + task_group_base(task_group_context& ctx) + : m_wait_ctx(0) + , m_context(&ctx) + {} + + ~task_group_base() noexcept(false) { + if (m_wait_ctx.continue_execution()) { +#if __TBB_CPP17_UNCAUGHT_EXCEPTIONS_PRESENT + bool stack_unwinding_in_progress = std::uncaught_exceptions() > 0; +#else + bool stack_unwinding_in_progress = std::uncaught_exception(); +#endif + // Always attempt to do proper cleanup to avoid inevitable memory corruption + // in case of missing wait (for the sake of better testability & debuggability) + if (!context().is_group_execution_cancelled()) + cancel(); + d1::wait(m_wait_ctx, context()); + if (!stack_unwinding_in_progress) + throw_exception(exception_id::missing_wait); + } + } + + task_group_status wait() { + bool cancellation_status = false; + try_call([&] { + d1::wait(m_wait_ctx, context()); + }).on_completion([&] { + // TODO: the reset method is not thread-safe. Ensure the correct behavior. + cancellation_status = m_context.is_group_execution_cancelled(); + context().reset(); + }); + return cancellation_status ? 
canceled : complete; + } + + void cancel() { + context().cancel_group_execution(); + } +}; // class task_group_base + +class task_group : public task_group_base { +public: + task_group() : task_group_base(task_group_context::concurrent_wait) {} + task_group(task_group_context& ctx) : task_group_base(ctx) {} + + template + void run(F&& f) { + spawn(*prepare_task(std::forward(f)), context()); + } + + void run(d2::task_handle&& h) { + __TBB_ASSERT(h != nullptr, "Attempt to schedule empty task_handle"); + + using acs = d2::task_handle_accessor; + __TBB_ASSERT(&acs::ctx_of(h) == &context(), "Attempt to schedule task_handle into different task_group"); + + spawn(*acs::release(h), context()); + } + + template + d2::task_handle defer(F&& f) { + return prepare_task_handle(std::forward(f)); + + } + + template + task_group_status run_and_wait(const F& f) { + return internal_run_and_wait(f); + } + + task_group_status run_and_wait(d2::task_handle&& h) { + return internal_run_and_wait(std::move(h)); + } +}; // class task_group + +#if TBB_PREVIEW_ISOLATED_TASK_GROUP +class spawn_delegate : public delegate_base { + task* task_to_spawn; + task_group_context& context; + bool operator()() const override { + spawn(*task_to_spawn, context); + return true; + } +public: + spawn_delegate(task* a_task, task_group_context& ctx) + : task_to_spawn(a_task), context(ctx) + {} +}; + +class wait_delegate : public delegate_base { + bool operator()() const override { + status = tg.wait(); + return true; + } +protected: + task_group& tg; + task_group_status& status; +public: + wait_delegate(task_group& a_group, task_group_status& tgs) + : tg(a_group), status(tgs) {} +}; + +template +class run_wait_delegate : public wait_delegate { + F& func; + bool operator()() const override { + status = tg.run_and_wait(func); + return true; + } +public: + run_wait_delegate(task_group& a_group, F& a_func, task_group_status& tgs) + : wait_delegate(a_group, tgs), func(a_func) {} +}; + +class isolated_task_group : public task_group { + intptr_t this_isolation() { + return reinterpret_cast(this); + } +public: + isolated_task_group() : task_group() {} + + isolated_task_group(task_group_context& ctx) : task_group(ctx) {} + + template + void run(F&& f) { + spawn_delegate sd(prepare_task(std::forward(f)), context()); + r1::isolate_within_arena(sd, this_isolation()); + } + + void run(d2::task_handle&& h) { + __TBB_ASSERT(h != nullptr, "Attempt to schedule empty task_handle"); + + using acs = d2::task_handle_accessor; + __TBB_ASSERT(&acs::ctx_of(h) == &context(), "Attempt to schedule task_handle into different task_group"); + + spawn_delegate sd(acs::release(h), context()); + r1::isolate_within_arena(sd, this_isolation()); + } + + template + task_group_status run_and_wait( const F& f ) { + task_group_status result = not_complete; + run_wait_delegate rwd(*this, f, result); + r1::isolate_within_arena(rwd, this_isolation()); + __TBB_ASSERT(result != not_complete, "premature exit from wait?"); + return result; + } + + task_group_status wait() { + task_group_status result = not_complete; + wait_delegate wd(*this, result); + r1::isolate_within_arena(wd, this_isolation()); + __TBB_ASSERT(result != not_complete, "premature exit from wait?"); + return result; + } +}; // class isolated_task_group +#endif // TBB_PREVIEW_ISOLATED_TASK_GROUP + +inline bool is_current_task_group_canceling() { + task_group_context* ctx = current_context(); + return ctx ? 
ctx->is_group_execution_cancelled() : false; +} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::task_group_context; +using detail::d1::task_group; +#if TBB_PREVIEW_ISOLATED_TASK_GROUP +using detail::d1::isolated_task_group; +#endif + +using detail::d1::task_group_status; +using detail::d1::not_complete; +using detail::d1::complete; +using detail::d1::canceled; + +using detail::d1::is_current_task_group_canceling; +using detail::r1::missing_wait; + +using detail::d2::task_handle; +} + +} // namespace tbb + +#if _MSC_VER && !defined(__INTEL_COMPILER) + #pragma warning(pop) // 4324 warning +#endif + +#endif // __TBB_task_group_H diff --git a/third_party/tbb/task_group_context.cpp b/third_party/tbb/task_group_context.cpp new file mode 100644 index 000000000..4f91c54e0 --- /dev/null +++ b/third_party/tbb/task_group_context.cpp @@ -0,0 +1,359 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/tbb_allocator.h" +#include "third_party/tbb/task_group.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/thread_data.h" +#include "third_party/tbb/scheduler_common.h" +#include "third_party/tbb/itt_notify.h" +#include "third_party/tbb/task_dispatcher.h" + +#include "third_party/libcxx/type_traits" + +namespace tbb { +namespace detail { +namespace r1 { + +//------------------------------------------------------------------------ +// tbb_exception_ptr +//------------------------------------------------------------------------ +tbb_exception_ptr* tbb_exception_ptr::allocate() noexcept { + tbb_exception_ptr* eptr = (tbb_exception_ptr*)allocate_memory(sizeof(tbb_exception_ptr)); + return eptr ? new (eptr) tbb_exception_ptr(std::current_exception()) : nullptr; +} + +void tbb_exception_ptr::destroy() noexcept { + this->~tbb_exception_ptr(); + deallocate_memory(this); +} + +void tbb_exception_ptr::throw_self() { + if (governor::rethrow_exception_broken()) fix_broken_rethrow(); + std::rethrow_exception(my_ptr); +} + +//------------------------------------------------------------------------ +// task_group_context +//------------------------------------------------------------------------ + +void task_group_context_impl::destroy(d1::task_group_context& ctx) { + __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr); + + if (ctx.my_context_list != nullptr) { + __TBB_ASSERT(ctx.my_state.load(std::memory_order_relaxed) == d1::task_group_context::state::bound, nullptr); + // The owner can be destroyed at any moment. Access the associate data with caution. 
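+        // Illustrative sketch (editor-added, not upstream TBB code): my_node is an
+        // intrusive list node, so removal is pointer surgery on the node itself; the
+        // real context_list additionally synchronizes with its owner thread. Generic
+        // shape of the unlink step:
+        //
+        //   struct node { node* prev; node* next; };
+        //   void unlink(node& n) {
+        //       n.prev->next = n.next;        // splice the node out of the chain
+        //       n.next->prev = n.prev;
+        //       n.prev = n.next = nullptr;
+        //   }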
+ ctx.my_context_list->remove(ctx.my_node); + } + d1::cpu_ctl_env* ctl = reinterpret_cast(&ctx.my_cpu_ctl_env); +#if _MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER + suppress_unused_warning(ctl); +#endif + ctl->~cpu_ctl_env(); + + auto exception = ctx.my_exception.load(std::memory_order_relaxed); + if (exception) { + exception->destroy(); + } + ITT_STACK_DESTROY(ctx.my_itt_caller); + + poison_pointer(ctx.my_parent); + poison_pointer(ctx.my_context_list); + poison_pointer(ctx.my_node.my_next_node); + poison_pointer(ctx.my_node.my_prev_node); + poison_pointer(ctx.my_exception); + poison_pointer(ctx.my_itt_caller); + + ctx.my_state.store(d1::task_group_context::state::dead, std::memory_order_release); +} + +void task_group_context_impl::initialize(d1::task_group_context& ctx) { + ITT_TASK_GROUP(&ctx, ctx.my_name, nullptr); + + ctx.my_node.my_next_node = &ctx.my_node; + ctx.my_node.my_prev_node = &ctx.my_node; + ctx.my_cpu_ctl_env = 0; + ctx.my_cancellation_requested = 0; + ctx.my_may_have_children.store(0, std::memory_order_relaxed); + // Set the created state to bound at the first usage. + ctx.my_state.store(d1::task_group_context::state::created, std::memory_order_relaxed); + ctx.my_parent = nullptr; + ctx.my_context_list = nullptr; + ctx.my_exception.store(nullptr, std::memory_order_relaxed); + ctx.my_itt_caller = nullptr; + + static_assert(sizeof(d1::cpu_ctl_env) <= sizeof(ctx.my_cpu_ctl_env), "FPU settings storage does not fit to uint64_t"); + d1::cpu_ctl_env* ctl = new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env; + if (ctx.my_traits.fp_settings) + ctl->get_env(); +} + +void task_group_context_impl::register_with(d1::task_group_context& ctx, thread_data* td) { + __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr); + __TBB_ASSERT(td, nullptr); + ctx.my_context_list = td->my_context_list; + + ctx.my_context_list->push_front(ctx.my_node); +} + +void task_group_context_impl::bind_to_impl(d1::task_group_context& ctx, thread_data* td) { + __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr); + __TBB_ASSERT(ctx.my_state.load(std::memory_order_relaxed) == d1::task_group_context::state::locked, "The context can be bound only under the lock."); + __TBB_ASSERT(!ctx.my_parent, "Parent is set before initial binding"); + + ctx.my_parent = td->my_task_dispatcher->m_execute_data_ext.context; + __TBB_ASSERT(ctx.my_parent, nullptr); + + // Inherit FPU settings only if the context has not captured FPU settings yet. + if (!ctx.my_traits.fp_settings) + copy_fp_settings(ctx, *ctx.my_parent); + + // Condition below prevents unnecessary thrashing parent context's cache line + if (ctx.my_parent->my_may_have_children.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children) { + ctx.my_parent->my_may_have_children.store(d1::task_group_context::may_have_children, std::memory_order_relaxed); // full fence is below + } + if (ctx.my_parent->my_parent) { + // Even if this context were made accessible for state change propagation + // (by placing store_with_release(td->my_context_list_state.head.my_next, &ctx.my_node) + // above), it still could be missed if state propagation from a grand-ancestor + // was underway concurrently with binding. + // Speculative propagation from the parent together with epoch counters + // detecting possibility of such a race allow to avoid taking locks when + // there is no contention. 
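+        // Illustrative sketch (editor-added, not upstream TBB code): the epoch check
+        // below is an optimistic-validation pattern; in generic form (names are
+        // hypothetical):
+        //
+        //   std::uintptr_t snapshot = epoch.load(std::memory_order_acquire);
+        //   copy_state_from_parent();                   // speculative, unsynchronized copy
+        //   publish_self();                             // full fence: visible to propagators
+        //   if (snapshot != global_epoch.load(std::memory_order_relaxed)) {
+        //       std::lock_guard<std::mutex> lock(propagation_mutex);
+        //       copy_state_from_parent();               // repeat safely under the lock
+        //   }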
+ + // Acquire fence is necessary to prevent reordering subsequent speculative + // loads of parent state data out of the scope where epoch counters comparison + // can reliably validate it. + uintptr_t local_count_snapshot = ctx.my_parent->my_context_list->epoch.load(std::memory_order_acquire); + // Speculative propagation of parent's state. The speculation will be + // validated by the epoch counters check further on. + ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed); + register_with(ctx, td); // Issues full fence + + // If no state propagation was detected by the following condition, the above + // full fence guarantees that the parent had correct state during speculative + // propagation before the fence. Otherwise the propagation from parent is + // repeated under the lock. + if (local_count_snapshot != the_context_state_propagation_epoch.load(std::memory_order_relaxed)) { + // Another thread may be propagating state change right now. So resort to lock. + context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex); + ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed); + } + } else { + register_with(ctx, td); // Issues full fence + // As we do not have grand-ancestors, concurrent state propagation (if any) + // may originate only from the parent context, and thus it is safe to directly + // copy the state from it. + ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed); + } +} + +void task_group_context_impl::bind_to(d1::task_group_context& ctx, thread_data* td) { + d1::task_group_context::state state = ctx.my_state.load(std::memory_order_acquire); + if (state <= d1::task_group_context::state::locked) { + if (state == d1::task_group_context::state::created && +#if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1910 + ((std::atomic::type>&)ctx.my_state).compare_exchange_strong( + (typename std::underlying_type::type&)state, + (typename std::underlying_type::type)d1::task_group_context::state::locked) +#else + ctx.my_state.compare_exchange_strong(state, d1::task_group_context::state::locked) +#endif + ) { + // If we are in the outermost task dispatch loop of an external thread, then + // there is nothing to bind this context to, and we skip the binding part + // treating the context as isolated. 
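+                // Editor-added note (not upstream code): my_state is a small one-shot state
+                // machine, created -> locked -> (bound | isolated) -> dead; "locked" is
+                // claimed by the compare_exchange above so that exactly one thread performs
+                // the binding while the others spin. The same idiom in generic form
+                // (st is a hypothetical enum):
+                //
+                //   std::atomic<st> state{st::created};
+                //   st expected = st::created;
+                //   if (state.compare_exchange_strong(expected, st::locked)) {
+                //       initialize_once();                                 // sole winner
+                //       state.store(st::ready, std::memory_order_release);
+                //   }
+                //   while (state.load(std::memory_order_acquire) == st::locked) { /* spin */ }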
+                __TBB_ASSERT(td->my_task_dispatcher->m_execute_data_ext.context != nullptr, nullptr);
+                d1::task_group_context::state release_state{};
+                if (td->my_task_dispatcher->m_execute_data_ext.context == td->my_arena->my_default_ctx || !ctx.my_traits.bound) {
+                    if (!ctx.my_traits.fp_settings) {
+                        copy_fp_settings(ctx, *td->my_arena->my_default_ctx);
+                    }
+                    release_state = d1::task_group_context::state::isolated;
+                } else {
+                    bind_to_impl(ctx, td);
+                    release_state = d1::task_group_context::state::bound;
+                }
+                ITT_STACK_CREATE(ctx.my_itt_caller);
+                ctx.my_state.store(release_state, std::memory_order_release);
+            }
+            spin_wait_while_eq(ctx.my_state, d1::task_group_context::state::locked);
+        }
+    __TBB_ASSERT(ctx.my_state.load(std::memory_order_relaxed) != d1::task_group_context::state::created, nullptr);
+    __TBB_ASSERT(ctx.my_state.load(std::memory_order_relaxed) != d1::task_group_context::state::locked, nullptr);
+}
+
+void task_group_context_impl::propagate_task_group_state(d1::task_group_context& ctx, std::atomic<std::uint32_t> d1::task_group_context::* mptr_state, d1::task_group_context& src, std::uint32_t new_state) {
+    __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr);
+    /* 1. if ((ctx.*mptr_state).load(std::memory_order_relaxed) == new_state):
+       Nothing to do, whether descending from "src" or not, so no need to scan.
+       Hopefully this happens often thanks to earlier invocations.
+       This optimization is enabled by LIFO order in the context lists:
+       - new contexts are bound to the beginning of lists;
+       - descendants are newer than ancestors;
+       - earlier invocations are therefore likely to "paint" long chains.
+       2. if (&ctx != &src):
+       This clause is disjunct from the traversal below, which skips src entirely.
+       Note that src.*mptr_state is not necessarily still equal to new_state (another thread may have changed it again).
+       Such interference is probably not frequent enough to aim for optimisation by writing new_state again (to make the other thread back down).
+       Letting the other thread prevail may also be fairer.
+    */
+    if ((ctx.*mptr_state).load(std::memory_order_relaxed) != new_state && &ctx != &src) {
+        for (d1::task_group_context* ancestor = ctx.my_parent; ancestor != nullptr; ancestor = ancestor->my_parent) {
+            if (ancestor == &src) {
+                for (d1::task_group_context* c = &ctx; c != ancestor; c = c->my_parent)
+                    (c->*mptr_state).store(new_state, std::memory_order_relaxed);
+                break;
+            }
+        }
+    }
+}
+
+bool task_group_context_impl::cancel_group_execution(d1::task_group_context& ctx) {
+    __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr);
+    __TBB_ASSERT(ctx.my_cancellation_requested.load(std::memory_order_relaxed) <= 1, "The cancellation state can be either 0 or 1");
+    if (ctx.my_cancellation_requested.load(std::memory_order_relaxed) || ctx.my_cancellation_requested.exchange(1)) {
+        // This task group and any descendants have already been canceled.
+        // (A newly added descendant would inherit its parent's ctx.my_cancellation_requested,
+        // not missing out on any cancellation still being propagated, and a context cannot be uncanceled.)
+        return false;
+    }
+    governor::get_thread_data()->my_arena->my_threading_control->propagate_task_group_state(&d1::task_group_context::my_cancellation_requested, ctx, uint32_t(1));
+    return true;
+}
+
+bool task_group_context_impl::is_group_execution_cancelled(const d1::task_group_context& ctx) {
+    return ctx.my_cancellation_requested.load(std::memory_order_relaxed) != 0;
+}
+
+// IMPORTANT: It is assumed that this method is not used concurrently!
+void task_group_context_impl::reset(d1::task_group_context& ctx) { + __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr); + //! TODO: Add assertion that this context does not have children + // No fences are necessary since this context can be accessed from another thread + // only after stealing happened (which means necessary fences were used). + + auto exception = ctx.my_exception.load(std::memory_order_relaxed); + if (exception) { + exception->destroy(); + ctx.my_exception.store(nullptr, std::memory_order_relaxed); + } + ctx.my_cancellation_requested = 0; +} + +// IMPORTANT: It is assumed that this method is not used concurrently! +void task_group_context_impl::capture_fp_settings(d1::task_group_context& ctx) { + __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr); + //! TODO: Add assertion that this context does not have children + // No fences are necessary since this context can be accessed from another thread + // only after stealing happened (which means necessary fences were used). + d1::cpu_ctl_env* ctl = reinterpret_cast(&ctx.my_cpu_ctl_env); + if (!ctx.my_traits.fp_settings) { + ctl = new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env; + ctx.my_traits.fp_settings = true; + } + ctl->get_env(); +} + +void task_group_context_impl::copy_fp_settings(d1::task_group_context& ctx, const d1::task_group_context& src) { + __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr); + __TBB_ASSERT(!ctx.my_traits.fp_settings, "The context already has FPU settings."); + __TBB_ASSERT(src.my_traits.fp_settings, "The source context does not have FPU settings."); + + const d1::cpu_ctl_env* src_ctl = reinterpret_cast(&src.my_cpu_ctl_env); + new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env(*src_ctl); + ctx.my_traits.fp_settings = true; +} + +/* + Comments: + +1. The premise of the cancellation support implementation is that cancellations are + not part of the hot path of the program execution. Therefore all changes in its + implementation in order to reduce the overhead of the cancellation control flow + should be done only in ways that do not increase overhead of the normal execution. + + In general, contexts are used by all threads and their descendants are created in + different threads as well. In order to minimize impact of the cross-thread tree + maintenance (first of all because of the synchronization), the tree of contexts + is split into pieces, each of which is handled by a single thread. Such pieces + are represented as lists of contexts, members of which are contexts that were + bound to their parents in the given thread. + + The context tree maintenance and cancellation propagation algorithms are designed + in such a manner that cross-thread access to a context list will take place only + when cancellation signal is sent (by user or when an exception happens), and + synchronization is necessary only then. Thus the normal execution flow (without + exceptions and cancellation) remains free from any synchronization done on + behalf of exception handling and cancellation support. + +2. Consider parallel cancellations at the different levels of the context tree: + + Ctx1 <- Cancelled by Thread1 |- Thread2 started processing + | | + Ctx2 |- Thread1 started processing + | T1 |- Thread2 finishes and syncs up local counters + Ctx3 <- Cancelled by Thread2 | + | |- Ctx5 is bound to Ctx2 + Ctx4 | + T2 |- Thread1 reaches Ctx2 + + Thread-propagator of each cancellation increments global counter. 
However the thread + propagating the cancellation from the outermost context (Thread1) may be the last + to finish. Which means that the local counters may be synchronized earlier (by Thread2, + at Time1) than it propagated cancellation into Ctx2 (at time Time2). If a new context + (Ctx5) is created and bound to Ctx2 between Time1 and Time2, checking its parent only + (Ctx2) may result in cancellation request being lost. + + This issue is solved by doing the whole propagation under the lock. + + If we need more concurrency while processing parallel cancellations, we could try + the following modification of the propagation algorithm: + + advance global counter and remember it + for each thread: + scan thread's list of contexts + for each thread: + sync up its local counter only if the global counter has not been changed + + However this version of the algorithm requires more analysis and verification. +*/ + +void __TBB_EXPORTED_FUNC initialize(d1::task_group_context& ctx) { + task_group_context_impl::initialize(ctx); +} +void __TBB_EXPORTED_FUNC destroy(d1::task_group_context& ctx) { + task_group_context_impl::destroy(ctx); +} +void __TBB_EXPORTED_FUNC reset(d1::task_group_context& ctx) { + task_group_context_impl::reset(ctx); +} +bool __TBB_EXPORTED_FUNC cancel_group_execution(d1::task_group_context& ctx) { + return task_group_context_impl::cancel_group_execution(ctx); +} +bool __TBB_EXPORTED_FUNC is_group_execution_cancelled(d1::task_group_context& ctx) { + return task_group_context_impl::is_group_execution_cancelled(ctx); +} +void __TBB_EXPORTED_FUNC capture_fp_settings(d1::task_group_context& ctx) { + task_group_context_impl::capture_fp_settings(ctx); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb + diff --git a/third_party/tbb/task_scheduler_observer.h b/third_party/tbb/task_scheduler_observer.h new file mode 100644 index 000000000..4c3d31e79 --- /dev/null +++ b/third_party/tbb/task_scheduler_observer.h @@ -0,0 +1,117 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_task_scheduler_observer_H +#define __TBB_task_scheduler_observer_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/task_arena.h" +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { + +namespace d1 { +class task_scheduler_observer; +} + +namespace r1 { +class observer_proxy; +class observer_list; + +//! Enable or disable observation +/** For local observers the method can be used only when the current thread +has the task scheduler initialized or is attached to an arena. +Repeated calls with the same state are no-ops. **/ +TBB_EXPORT void __TBB_EXPORTED_FUNC observe(d1::task_scheduler_observer&, bool state = true); +} + +namespace d1 { +class task_scheduler_observer { + friend class r1::observer_proxy; + friend class r1::observer_list; + friend void r1::observe(d1::task_scheduler_observer&, bool); + + //! Pointer to the proxy holding this observer. 
+ /** Observers are proxied by the scheduler to maintain persistent lists of them. **/ + std::atomic my_proxy{ nullptr }; + + //! Counter preventing the observer from being destroyed while in use by the scheduler. + /** Valid only when observation is on. **/ + std::atomic my_busy_count{ 0 }; + + //! Contains task_arena pointer + task_arena* my_task_arena{ nullptr }; +public: + //! Returns true if observation is enabled, false otherwise. + bool is_observing() const { return my_proxy.load(std::memory_order_relaxed) != nullptr; } + + //! Entry notification + /** Invoked from inside observe(true) call and whenever a worker enters the arena + this observer is associated with. If a thread is already in the arena when + the observer is activated, the entry notification is called before it + executes the first stolen task. **/ + virtual void on_scheduler_entry( bool /*is_worker*/ ) {} + + //! Exit notification + /** Invoked from inside observe(false) call and whenever a worker leaves the + arena this observer is associated with. **/ + virtual void on_scheduler_exit( bool /*is_worker*/ ) {} + + //! Construct local or global observer in inactive state (observation disabled). + /** For a local observer entry/exit notifications are invoked whenever a worker + thread joins/leaves the arena of the observer's owner thread. If a thread is + already in the arena when the observer is activated, the entry notification is + called before it executes the first stolen task. **/ + explicit task_scheduler_observer() = default; + + //! Construct local observer for a given arena in inactive state (observation disabled). + /** entry/exit notifications are invoked whenever a thread joins/leaves arena. + If a thread is already in the arena when the observer is activated, the entry notification + is called before it executes the first stolen task. **/ + explicit task_scheduler_observer(task_arena& a) : my_task_arena(&a) {} + + /** Destructor protects instance of the observer from concurrent notification. + It is recommended to disable observation before destructor of a derived class starts, + otherwise it can lead to concurrent notification callback on partly destroyed object **/ + virtual ~task_scheduler_observer() { + if (my_proxy.load(std::memory_order_acquire)) { + observe(false); + } + } + + //! Enable or disable observation + /** Warning: concurrent invocations of this method are not safe. + Repeated calls with the same state are no-ops. **/ + void observe(bool state = true) { + if( state && !my_proxy.load(std::memory_order_relaxed) ) { + __TBB_ASSERT( my_busy_count.load(std::memory_order_relaxed) == 0, "Inconsistent state of task_scheduler_observer instance"); + } + r1::observe(*this, state); + } +}; + +} // namespace d1 +} // namespace detail + +inline namespace v1 { + using detail::d1::task_scheduler_observer; +} +} // namespace tbb + + +#endif /* __TBB_task_scheduler_observer_H */ diff --git a/third_party/tbb/task_stream.h b/third_party/tbb/task_stream.h new file mode 100644 index 000000000..54d84446c --- /dev/null +++ b/third_party/tbb/task_stream.h @@ -0,0 +1,287 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_task_stream_H +#define _TBB_task_stream_H + +//! This file is a possible future replacement for the task_stream class implemented in +//! task_stream.h. It refactors the code and extends task_stream capabilities by moving lane +//! management during operations on caller side. Despite the fact that new implementation should not +//! affect performance of the original task stream, analysis on this subject was not made at the +//! time it was developed. In addition, it is not clearly seen at the moment that this container +//! would be suitable for critical tasks due to linear time complexity on its operations. + +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/cache_aligned_allocator.h" +#include "third_party/tbb/mutex.h" + +#include "third_party/tbb/scheduler_common.h" +#include "third_party/tbb/misc.h" // for FastRandom + +#include "third_party/libcxx/deque" +#include "third_party/libcxx/climits" +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace r1 { + +//! Essentially, this is just a pair of a queue and a mutex to protect the queue. +/** The reason std::pair is not used is that the code would look less clean + if field names were replaced with 'first' and 'second'. **/ +template< typename T, typename mutex_t > +struct alignas(max_nfs_size) queue_and_mutex { + typedef std::deque< T, cache_aligned_allocator > queue_base_t; + + queue_base_t my_queue{}; + mutex_t my_mutex{}; +}; + +using population_t = uintptr_t; +const population_t one = 1; + +inline void set_one_bit( std::atomic& dest, int pos ) { + __TBB_ASSERT( pos>=0, nullptr); + __TBB_ASSERT( pos& dest, int pos ) { + __TBB_ASSERT( pos>=0, nullptr); + __TBB_ASSERT( pos=0, nullptr); + __TBB_ASSERT( pos +class task_stream_accessor : no_copy { +protected: + using lane_t = queue_and_mutex ; + d1::task* get_item( lane_t::queue_base_t& queue ) { + d1::task* result = queue.front(); + queue.pop_front(); + return result; + } +}; + +template<> +class task_stream_accessor< back_nonnull_accessor > : no_copy { +protected: + using lane_t = queue_and_mutex ; + d1::task* get_item( lane_t::queue_base_t& queue ) { + d1::task* result = nullptr; + __TBB_ASSERT(!queue.empty(), nullptr); + // Isolated task can put zeros in queue see look_specific + do { + result = queue.back(); + queue.pop_back(); + } while ( !result && !queue.empty() ); + return result; + } +}; + +//! The container for "fairness-oriented" aka "enqueued" tasks. +template +class task_stream : public task_stream_accessor< accessor > { + using lane_t = typename task_stream_accessor::lane_t; + std::atomic population{}; + lane_t* lanes{nullptr}; + unsigned N{}; + +public: + task_stream() = default; + + void initialize( unsigned n_lanes ) { + const unsigned max_lanes = sizeof(population_t) * CHAR_BIT; + + N = n_lanes >= max_lanes ? max_lanes : n_lanes > 2 ? 
1 << (tbb::detail::log2(n_lanes - 1) + 1) : 2; + __TBB_ASSERT( N == max_lanes || (N >= n_lanes && ((N - 1) & N) == 0), "number of lanes miscalculated" ); + __TBB_ASSERT( N <= sizeof(population_t) * CHAR_BIT, nullptr); + lanes = static_cast(cache_aligned_allocate(sizeof(lane_t) * N)); + for (unsigned i = 0; i < N; ++i) { + new (lanes + i) lane_t; + } + __TBB_ASSERT( !population.load(std::memory_order_relaxed), nullptr); + } + + ~task_stream() { + if (lanes) { + for (unsigned i = 0; i < N; ++i) { + lanes[i].~lane_t(); + } + cache_aligned_deallocate(lanes); + } + } + + //! Push a task into a lane. Lane selection is performed by passed functor. + template + void push(d1::task* source, const lane_selector_t& next_lane ) { + bool succeed = false; + unsigned lane = 0; + do { + lane = next_lane( /*out_of=*/N ); + __TBB_ASSERT( lane < N, "Incorrect lane index." ); + } while( ! (succeed = try_push( source, lane )) ); + } + + //! Try finding and popping a task using passed functor for lane selection. Last used lane is + //! updated inside lane selector. + template + d1::task* pop( const lane_selector_t& next_lane ) { + d1::task* popped = nullptr; + unsigned lane = 0; + for (atomic_backoff b; !empty() && !popped; b.pause()) { + lane = next_lane( /*out_of=*/N); + __TBB_ASSERT(lane < N, "Incorrect lane index."); + popped = try_pop(lane); + } + return popped; + } + + //! Try finding and popping a related task. + d1::task* pop_specific( unsigned& last_used_lane, isolation_type isolation ) { + d1::task* result = nullptr; + // Lane selection is round-robin in backward direction. + unsigned idx = last_used_lane & (N-1); + do { + if( is_bit_set( population.load(std::memory_order_relaxed), idx ) ) { + lane_t& lane = lanes[idx]; + mutex::scoped_lock lock; + if( lock.try_acquire(lane.my_mutex) && !lane.my_queue.empty() ) { + result = look_specific( lane.my_queue, isolation ); + if( lane.my_queue.empty() ) + clear_one_bit( population, idx ); + if( result ) + break; + } + } + idx=(idx-1)&(N-1); + } while( !empty() && idx != last_used_lane ); + last_used_lane = idx; + return result; + } + + //! Checks existence of a task. + bool empty() { + return !population.load(std::memory_order_relaxed); + } + +private: + //! Returns true on successful push, otherwise - false. + bool try_push(d1::task* source, unsigned lane_idx ) { + mutex::scoped_lock lock; + if( lock.try_acquire( lanes[lane_idx].my_mutex ) ) { + lanes[lane_idx].my_queue.push_back( source ); + set_one_bit( population, lane_idx ); // TODO: avoid atomic op if the bit is already set + return true; + } + return false; + } + + //! Returns pointer to task on successful pop, otherwise - nullptr. + d1::task* try_pop( unsigned lane_idx ) { + if( !is_bit_set( population.load(std::memory_order_relaxed), lane_idx ) ) + return nullptr; + d1::task* result = nullptr; + lane_t& lane = lanes[lane_idx]; + mutex::scoped_lock lock; + if( lock.try_acquire( lane.my_mutex ) && !lane.my_queue.empty() ) { + result = this->get_item( lane.my_queue ); + if( lane.my_queue.empty() ) + clear_one_bit( population, lane_idx ); + } + return result; + } + + // TODO: unify '*_specific' logic with 'pop' methods above + d1::task* look_specific( typename lane_t::queue_base_t& queue, isolation_type isolation ) { + __TBB_ASSERT( !queue.empty(), nullptr); + // TODO: add a worst-case performance test and consider an alternative container with better + // performance for isolation search. 
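+ // The loop below walks the lane from its back end toward the front looking for a
+ // task whose isolation tag matches the requested one. A match found at the very back
+ // is popped outright; a match found deeper inside the deque is replaced with a
+ // nullptr placeholder, which the back_nonnull_accessor specialization of get_item()
+ // skips later.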
+ typename lane_t::queue_base_t::iterator curr = queue.end(); + do { + // TODO: consider logic from get_task to simplify the code. + d1::task* result = *--curr; + if( result && task_accessor::isolation(*result) == isolation ) { + if( queue.end() - curr == 1 ) + queue.pop_back(); // a little of housekeeping along the way + else + *curr = nullptr; // grabbing task with the same isolation + // TODO: move one of the container's ends instead if the task has been found there + return result; + } + } while( curr != queue.begin() ); + return nullptr; + } + +}; // task_stream + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_task_stream_H */ diff --git a/third_party/tbb/tbb.h b/third_party/tbb/tbb.h new file mode 100644 index 000000000..f83d7791f --- /dev/null +++ b/third_party/tbb/tbb.h @@ -0,0 +1,75 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_tbb_H +#define __TBB_tbb_H + +/** + This header bulk-includes declarations or definitions of all the functionality + provided by TBB (save for tbbmalloc and 3rd party dependent headers). + + If you use only a few TBB constructs, consider including specific headers only. + Any header listed below can be included independently of others. 
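+
+ For example, a translation unit that only needs a parallel loop can include
+ "third_party/tbb/parallel_for.h" (and, if needed, "third_party/tbb/blocked_range.h")
+ directly, rather than pulling in this umbrella header.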
+**/ + +// MISSING #include "oneapi/tbb/blocked_range.h" +// MISSING #include "oneapi/tbb/blocked_range2d.h" +// MISSING #include "oneapi/tbb/blocked_range3d.h" +#if TBB_PREVIEW_BLOCKED_RANGE_ND +// MISSING #include "tbb/blocked_rangeNd.h" +#endif +// MISSING #include "oneapi/tbb/cache_aligned_allocator.h" +// MISSING #include "oneapi/tbb/combinable.h" +// MISSING #include "oneapi/tbb/concurrent_hash_map.h" +#if TBB_PREVIEW_CONCURRENT_LRU_CACHE +// MISSING #include "tbb/concurrent_lru_cache.h" +#endif +// MISSING #include "oneapi/tbb/collaborative_call_once.h" +// MISSING #include "oneapi/tbb/concurrent_priority_queue.h" +// MISSING #include "oneapi/tbb/concurrent_queue.h" +// MISSING #include "oneapi/tbb/concurrent_unordered_map.h" +// MISSING #include "oneapi/tbb/concurrent_unordered_set.h" +// MISSING #include "oneapi/tbb/concurrent_map.h" +// MISSING #include "oneapi/tbb/concurrent_set.h" +// MISSING #include "oneapi/tbb/concurrent_vector.h" +// MISSING #include "oneapi/tbb/enumerable_thread_specific.h" +// MISSING #include "oneapi/tbb/flow_graph.h" +// MISSING #include "oneapi/tbb/global_control.h" +// MISSING #include "oneapi/tbb/info.h" +// MISSING #include "oneapi/tbb/null_mutex.h" +// MISSING #include "oneapi/tbb/null_rw_mutex.h" +// MISSING #include "oneapi/tbb/parallel_for.h" +// MISSING #include "oneapi/tbb/parallel_for_each.h" +// MISSING #include "oneapi/tbb/parallel_invoke.h" +// MISSING #include "oneapi/tbb/parallel_pipeline.h" +// MISSING #include "oneapi/tbb/parallel_reduce.h" +// MISSING #include "oneapi/tbb/parallel_scan.h" +// MISSING #include "oneapi/tbb/parallel_sort.h" +// MISSING #include "oneapi/tbb/partitioner.h" +// MISSING #include "oneapi/tbb/queuing_mutex.h" +// MISSING #include "oneapi/tbb/queuing_rw_mutex.h" +// MISSING #include "oneapi/tbb/spin_mutex.h" +// MISSING #include "oneapi/tbb/spin_rw_mutex.h" +// MISSING #include "oneapi/tbb/task.h" +// MISSING #include "oneapi/tbb/task_arena.h" +// MISSING #include "oneapi/tbb/task_group.h" +// MISSING #include "oneapi/tbb/task_scheduler_observer.h" +// MISSING #include "oneapi/tbb/tbb_allocator.h" +// MISSING #include "oneapi/tbb/tick_count.h" +// MISSING #include "oneapi/tbb/version.h" + +#endif /* __TBB_tbb_H */ diff --git a/third_party/tbb/tbb.mk b/third_party/tbb/tbb.mk new file mode 100644 index 000000000..b565bf3d8 --- /dev/null +++ b/third_party/tbb/tbb.mk @@ -0,0 +1,43 @@ +#-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐ +#───vi: set et ft=make ts=8 tw=8 fenc=utf-8 :vi───────────────────────┘ + +PKGS += THIRD_PARTY_TBB + +THIRD_PARTY_TBB_ARTIFACTS += THIRD_PARTY_TBB_A +THIRD_PARTY_TBB = $(THIRD_PARTY_TBB_A_DEPS) $(THIRD_PARTY_TBB_A) +THIRD_PARTY_TBB_A = o/$(MODE)/third_party/tbb/tbb.a +THIRD_PARTY_TBB_FILES := $(wildcard third_party/tbb/*) $(wildcard third_party/tbb/detail/*) +THIRD_PARTY_TBB_HDRS = $(filter %.h,$(THIRD_PARTY_TBB_FILES)) +THIRD_PARTY_TBB_SRCS = $(filter %.cpp,$(THIRD_PARTY_TBB_FILES)) +THIRD_PARTY_TBB_OBJS = $(THIRD_PARTY_TBB_SRCS:%.cpp=o/$(MODE)/%.o) + +# Use this to debug +# $(info $$THIRD_PARTY_TBB_HDRS is [${THIRD_PARTY_TBB_HDRS}]) + +THIRD_PARTY_TBB_CHECKS = \ + $(THIRD_PARTY_TBB_A).pkg \ + $(THIRD_PARTY_TBB_HDRS:%=o/$(MODE)/%.ok) + +THIRD_PARTY_TBB_A_DIRECTDEPS = \ + THIRD_PARTY_LIBCXX + +THIRD_PARTY_TBB_A_DEPS := \ + $(call uniq,$(foreach x,$(THIRD_PARTY_TBB_A_DIRECTDEPS),$($(x)))) + +$(THIRD_PARTY_TBB_A): \ + third_party/tbb/ \ + $(THIRD_PARTY_TBB_A).pkg \ + $(THIRD_PARTY_TBB_OBJS) + +$(THIRD_PARTY_TBB_A).pkg: \ + $(THIRD_PARTY_TBB_OBJS) \ + $(foreach 
x,$(THIRD_PARTY_TBB_A_DIRECTDEPS),$($(x)_A).pkg) + +THIRD_PARTY_TBB_LIBS = $(THIRD_PARTY_TBB_A) + +$(THIRD_PARTY_TBB_OBJS): $(BUILD_FILES) third_party/tbb/tbb.mk + +.PHONY: o/$(MODE)/third_party/tbb +o/$(MODE)/third_party/tbb: \ + $(THIRD_PARTY_TBB_CHECKS) \ + $(THIRD_PARTY_TBB_A) diff --git a/third_party/tbb/tbb.rc b/third_party/tbb/tbb.rc new file mode 100644 index 000000000..a60744cfd --- /dev/null +++ b/third_party/tbb/tbb.rc @@ -0,0 +1,75 @@ +// clang-format off +// Copyright (c) 2005-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +///////////////////////////////////////////////////////////////////////////// +// +// Includes +// +// MISSING #include +// MISSING #include "../../include/oneapi/tbb/version.h" + +///////////////////////////////////////////////////////////////////////////// +// Neutral resources + +#ifdef _WIN32 +LANGUAGE LANG_NEUTRAL, SUBLANG_NEUTRAL +#pragma code_page(1252) +#endif //_WIN32 + +///////////////////////////////////////////////////////////////////////////// +// +// Version +// +#define TBB_VERNUMBERS TBB_VERSION_MAJOR,TBB_VERSION_MINOR,TBB_VERSION_PATCH +#define TBB_VERSION TBB_VERSION_STRING + +VS_VERSION_INFO VERSIONINFO + FILEVERSION TBB_VERNUMBERS + PRODUCTVERSION TBB_VERNUMBERS + FILEFLAGSMASK 0x17L +#ifdef _DEBUG + FILEFLAGS 0x1L +#else + FILEFLAGS 0x0L +#endif + FILEOS 0x40004L + FILETYPE 0x2L + FILESUBTYPE 0x0L +BEGIN + BLOCK "StringFileInfo" + BEGIN + BLOCK "000004b0" + BEGIN + VALUE "CompanyName", "Intel Corporation\0" + VALUE "FileDescription", "oneAPI Threading Building Blocks (oneTBB) library\0" + VALUE "FileVersion", TBB_VERSION "\0" + VALUE "LegalCopyright", "Copyright 2005-2023 Intel Corporation. All Rights Reserved.\0" + VALUE "LegalTrademarks", "\0" +#ifndef TBB_USE_DEBUG + VALUE "OriginalFilename", "tbb12.dll\0" +#else + VALUE "OriginalFilename", "tbb12_debug.dll\0" +#endif + VALUE "ProductName", "oneAPI Threading Building Blocks (oneTBB)\0" + VALUE "ProductVersion", TBB_VERSION "\0" + VALUE "PrivateBuild", "\0" + VALUE "SpecialBuild", "\0" + END + END + BLOCK "VarFileInfo" + BEGIN + VALUE "Translation", 0x0, 1200 + END +END diff --git a/third_party/tbb/tbb_allocator.h b/third_party/tbb/tbb_allocator.h new file mode 100644 index 000000000..0284dfb89 --- /dev/null +++ b/third_party/tbb/tbb_allocator.h @@ -0,0 +1,127 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_tbb_allocator_H +#define __TBB_tbb_allocator_H + +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/libcxx/cstdlib" +#include "third_party/libcxx/utility" + +#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT +// MISSING #include +#endif + +namespace tbb { +namespace detail { + +namespace r1 { +TBB_EXPORT void* __TBB_EXPORTED_FUNC allocate_memory(std::size_t size); +TBB_EXPORT void __TBB_EXPORTED_FUNC deallocate_memory(void* p); +TBB_EXPORT bool __TBB_EXPORTED_FUNC is_tbbmalloc_used(); +} + +namespace d1 { + +template +class tbb_allocator { +public: + using value_type = T; + using propagate_on_container_move_assignment = std::true_type; + + //! Always defined for TBB containers (supported since C++17 for std containers) + using is_always_equal = std::true_type; + + //! Specifies current allocator + enum malloc_type { + scalable, + standard + }; + + tbb_allocator() = default; + template tbb_allocator(const tbb_allocator&) noexcept {} + + //! Allocate space for n objects. + __TBB_nodiscard T* allocate(std::size_t n) { + return static_cast(r1::allocate_memory(n * sizeof(value_type))); + } + + //! Free previously allocated block of memory. + void deallocate(T* p, std::size_t) { + r1::deallocate_memory(p); + } + + //! Returns current allocator + static malloc_type allocator_type() { + return r1::is_tbbmalloc_used() ? standard : scalable; + } + +#if TBB_ALLOCATOR_TRAITS_BROKEN + using pointer = value_type*; + using const_pointer = const value_type*; + using reference = value_type&; + using const_reference = const value_type&; + using difference_type = std::ptrdiff_t; + using size_type = std::size_t; + template struct rebind { + using other = tbb_allocator; + }; + //! Largest value for which method allocate might succeed. + size_type max_size() const noexcept { + size_type max = ~(std::size_t(0)) / sizeof(value_type); + return (max > 0 ? max : 1); + } + template + void construct(U *p, Args&&... args) + { ::new (p) U(std::forward(args)...); } + void destroy( pointer p ) { p->~value_type(); } + pointer address(reference x) const { return &x; } + const_pointer address(const_reference x) const { return &x; } +#endif // TBB_ALLOCATOR_TRAITS_BROKEN +}; + +#if TBB_ALLOCATOR_TRAITS_BROKEN + template<> + class tbb_allocator { + public: + using pointer = void*; + using const_pointer = const void*; + using value_type = void; + template struct rebind { + using other = tbb_allocator; + }; + }; +#endif + +template +inline bool operator==(const tbb_allocator&, const tbb_allocator&) noexcept { return true; } + +#if !__TBB_CPP20_COMPARISONS_PRESENT +template +inline bool operator!=(const tbb_allocator&, const tbb_allocator&) noexcept { return false; } +#endif + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::tbb_allocator; +} // namespace v1 +} // namespace tbb + +#endif /* __TBB_tbb_allocator_H */ diff --git a/third_party/tbb/tbbmalloc_proxy.h b/third_party/tbb/tbbmalloc_proxy.h new file mode 100644 index 000000000..cf262a207 --- /dev/null +++ b/third_party/tbb/tbbmalloc_proxy.h @@ -0,0 +1,66 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +/* +Replacing the standard memory allocation routines in Microsoft* C/C++ RTL +(malloc/free, global new/delete, etc.) with the TBB memory allocator. + +Include the following header to a source of any binary which is loaded during +application startup + +// MISSING #include "oneapi/tbb/tbbmalloc_proxy.h" + +or add following parameters to the linker options for the binary which is +loaded during application startup. It can be either exe-file or dll. + +For win32 +tbbmalloc_proxy.lib /INCLUDE:"___TBB_malloc_proxy" +win64 +tbbmalloc_proxy.lib /INCLUDE:"__TBB_malloc_proxy" +*/ + +#ifndef __TBB_tbbmalloc_proxy_H +#define __TBB_tbbmalloc_proxy_H + +#if _MSC_VER + +#ifdef _DEBUG + #pragma comment(lib, "tbbmalloc_proxy_debug.lib") +#else + #pragma comment(lib, "tbbmalloc_proxy.lib") +#endif + +#if defined(_WIN64) + #pragma comment(linker, "/include:__TBB_malloc_proxy") +#else + #pragma comment(linker, "/include:___TBB_malloc_proxy") +#endif + +#else +/* Primarily to support MinGW */ + +extern "C" void __TBB_malloc_proxy(); +struct __TBB_malloc_proxy_caller { + __TBB_malloc_proxy_caller() { __TBB_malloc_proxy(); } +} volatile __TBB_malloc_proxy_helper_object; + +#endif // _MSC_VER + +/* Public Windows API */ +extern "C" int TBB_malloc_replacement_log(char *** function_replacement_log_ptr); + +#endif //__TBB_tbbmalloc_proxy_H diff --git a/third_party/tbb/thread_control_monitor.h b/third_party/tbb/thread_control_monitor.h new file mode 100644 index 000000000..06e2755af --- /dev/null +++ b/third_party/tbb/thread_control_monitor.h @@ -0,0 +1,117 @@ +// clang-format off +/* + Copyright (c) 2021-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_thread_control_monitor_H +#define __TBB_thread_control_monitor_H + +#include "third_party/tbb/concurrent_monitor.h" +#include "third_party/tbb/scheduler_common.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace r1 { + +struct market_context { + market_context() = default; + + market_context(std::uintptr_t first_addr, arena* a) : + my_uniq_addr(first_addr), my_arena_addr(a) + {} + + std::uintptr_t my_uniq_addr{0}; + arena* my_arena_addr{nullptr}; +}; + +#if __TBB_RESUMABLE_TASKS +class resume_node : public wait_node { + using base_type = wait_node; +public: + resume_node(market_context ctx, execution_data_ext& ed_ext, task_dispatcher& target) + : base_type(ctx), my_curr_dispatcher(ed_ext.task_disp), my_target_dispatcher(&target) + , my_suspend_point(my_curr_dispatcher->get_suspend_point()) + {} + + ~resume_node() override { + if (this->my_skipped_wakeup) { + spin_wait_until_eq(this->my_notify_calls, 1); + } + + poison_pointer(my_curr_dispatcher); + poison_pointer(my_target_dispatcher); + poison_pointer(my_suspend_point); + } + + void init() override { + base_type::init(); + } + + void wait() override { + my_curr_dispatcher->resume(*my_target_dispatcher); + __TBB_ASSERT(!this->my_is_in_list.load(std::memory_order_relaxed), "Still in the queue?"); + } + + void reset() override { + base_type::reset(); + spin_wait_until_eq(this->my_notify_calls, 1); + my_notify_calls.store(0, std::memory_order_relaxed); + } + + // notify is called (perhaps, concurrently) twice from: + // - concurrent_monitor::notify + // - post_resume_action::register_waiter + // The second notify is called after thread switches the stack + // (Because we can not call resume while the stack is occupied) + // We need calling resume only when both notifications are performed. + void notify() override { + if (++my_notify_calls == 2) { + r1::resume(my_suspend_point); + } + } + +private: + friend class thread_data; + friend struct suspend_point_type::resume_task; + task_dispatcher* my_curr_dispatcher; + task_dispatcher* my_target_dispatcher; + suspend_point_type* my_suspend_point; + std::atomic my_notify_calls{0}; +}; +#endif // __TBB_RESUMABLE_TASKS + +class thread_control_monitor : public concurrent_monitor_base { + using base_type = concurrent_monitor_base; +public: + using base_type::base_type; + + ~thread_control_monitor() { + destroy(); + } + + /** per-thread descriptor for concurrent_monitor */ + using thread_context = sleep_node; +#if __TBB_RESUMABLE_TASKS + using resume_context = resume_node; +#endif +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_thread_control_monitor_H diff --git a/third_party/tbb/thread_data.h b/third_party/tbb/thread_data.h new file mode 100644 index 000000000..638e87e2f --- /dev/null +++ b/third_party/tbb/thread_data.h @@ -0,0 +1,260 @@ +// clang-format off +/* + Copyright (c) 2020-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_thread_data_H +#define __TBB_thread_data_H + +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/task.h" + +#include "third_party/tbb/rml_base.h" // rml::job + +#include "third_party/tbb/scheduler_common.h" +#include "third_party/tbb/arena.h" +#include "third_party/tbb/concurrent_monitor.h" +#include "third_party/tbb/mailbox.h" +#include "third_party/tbb/misc.h" // FastRandom +#include "third_party/tbb/small_object_pool_impl.h" +#include "third_party/tbb/intrusive_list.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace r1 { + +class task; +class arena_slot; +class task_group_context; +class task_dispatcher; +class thread_dispatcher_client; + +class context_list : public intrusive_list { +public: + bool orphaned{false}; + + //! Last state propagation epoch known to this thread + /** Together with the_context_state_propagation_epoch constitute synchronization protocol + that keeps hot path of task group context construction destruction mostly + lock-free. + When local epoch equals the global one, the state of task group contexts + registered with this thread is consistent with that of the task group trees + they belong to. **/ + std::atomic epoch{}; + + //! Mutex protecting access to the list of task group contexts. + d1::mutex m_mutex{}; + + void destroy() { + this->~context_list(); + cache_aligned_deallocate(this); + } + + void remove(d1::intrusive_list_node& val) { + mutex::scoped_lock lock(m_mutex); + + intrusive_list::remove(val); + + if (orphaned && empty()) { + lock.release(); + destroy(); + } + } + + void push_front(d1::intrusive_list_node& val) { + mutex::scoped_lock lock(m_mutex); + + intrusive_list::push_front(val); + } + + void orphan() { + mutex::scoped_lock lock(m_mutex); + + orphaned = true; + if (empty()) { + lock.release(); + destroy(); + } + } +}; + +//------------------------------------------------------------------------ +// Thread Data +//------------------------------------------------------------------------ +class thread_data : public ::rml::job + , public d1::intrusive_list_node + , no_copy { +public: + thread_data(unsigned short index, bool is_worker) + : my_arena_index{ index } + , my_is_worker{ is_worker } + , my_task_dispatcher{ nullptr } + , my_arena{ nullptr } + , my_last_client{ nullptr } + , my_arena_slot{} + , my_random{ this } + , my_last_observer{ nullptr } + , my_small_object_pool{new (cache_aligned_allocate(sizeof(small_object_pool_impl))) small_object_pool_impl{}} + , my_context_list(new (cache_aligned_allocate(sizeof(context_list))) context_list{}) +#if __TBB_RESUMABLE_TASKS + , my_post_resume_action{ task_dispatcher::post_resume_action::none } + , my_post_resume_arg{nullptr} +#endif /* __TBB_RESUMABLE_TASKS */ + { + ITT_SYNC_CREATE(&my_context_list->m_mutex, SyncType_Scheduler, SyncObj_ContextsList); + } + + ~thread_data() { + my_context_list->orphan(); + my_small_object_pool->destroy(); + poison_pointer(my_task_dispatcher); + poison_pointer(my_arena); + poison_pointer(my_arena_slot); + poison_pointer(my_last_observer); + poison_pointer(my_small_object_pool); + poison_pointer(my_context_list); +#if __TBB_RESUMABLE_TASKS + poison_pointer(my_post_resume_arg); +#endif /* __TBB_RESUMABLE_TASKS */ + } + + void attach_arena(arena& a, std::size_t index); + bool is_attached_to(arena*); + void attach_task_dispatcher(task_dispatcher&); + void detach_task_dispatcher(); + void enter_task_dispatcher(task_dispatcher& task_disp, std::uintptr_t stealing_threshold); + void 
leave_task_dispatcher(); + void propagate_task_group_state(std::atomic d1::task_group_context::* mptr_state, d1::task_group_context& src, uint32_t new_state); + + //! Index of the arena slot the scheduler occupies now, or occupied last time + unsigned short my_arena_index; + + //! Indicates if the thread is created by RML + const bool my_is_worker; + + //! The current task dipsatcher + task_dispatcher* my_task_dispatcher; + + //! The arena that I own (if external thread) or am servicing at the moment (if worker) + arena* my_arena; + + thread_dispatcher_client* my_last_client; + + //! Pointer to the slot in the arena we own at the moment + arena_slot* my_arena_slot; + + //! The mailbox (affinity mechanism) the current thread attached to + mail_inbox my_inbox; + + //! The random generator + FastRandom my_random; + + //! Last observer in the observers list processed on this slot + observer_proxy* my_last_observer; + + //! Pool of small object for fast task allocation + small_object_pool_impl* my_small_object_pool; + + context_list* my_context_list; +#if __TBB_RESUMABLE_TASKS + //! Suspends the current coroutine (task_dispatcher). + void suspend(void* suspend_callback, void* user_callback); + + //! Resumes the target task_dispatcher. + void resume(task_dispatcher& target); + + //! Set post resume action to perform after resume. + void set_post_resume_action(task_dispatcher::post_resume_action pra, void* arg) { + __TBB_ASSERT(my_post_resume_action == task_dispatcher::post_resume_action::none, "The Post resume action must not be set"); + __TBB_ASSERT(!my_post_resume_arg, "The post resume action must not have an argument"); + my_post_resume_action = pra; + my_post_resume_arg = arg; + } + + void clear_post_resume_action() { + my_post_resume_action = task_dispatcher::post_resume_action::none; + my_post_resume_arg = nullptr; + } + + //! The post resume action requested after the swap contexts. + task_dispatcher::post_resume_action my_post_resume_action; + + //! The post resume action argument. + void* my_post_resume_arg; +#endif /* __TBB_RESUMABLE_TASKS */ + + //! The default context + // TODO: consider using common default context because it is used only to simplify + // cancellation check. 
+ d1::task_group_context my_default_context; +}; + +inline void thread_data::attach_arena(arena& a, std::size_t index) { + my_arena = &a; + my_arena_index = static_cast(index); + my_arena_slot = a.my_slots + index; + // Read the current slot mail_outbox and attach it to the mail_inbox (remove inbox later maybe) + my_inbox.attach(my_arena->mailbox(index)); +} + +inline bool thread_data::is_attached_to(arena* a) { return my_arena == a; } + +inline void thread_data::attach_task_dispatcher(task_dispatcher& task_disp) { + __TBB_ASSERT(my_task_dispatcher == nullptr, nullptr); + __TBB_ASSERT(task_disp.m_thread_data == nullptr, nullptr); + task_disp.m_thread_data = this; + my_task_dispatcher = &task_disp; +} + +inline void thread_data::detach_task_dispatcher() { + __TBB_ASSERT(my_task_dispatcher != nullptr, nullptr); + __TBB_ASSERT(my_task_dispatcher->m_thread_data == this, nullptr); + my_task_dispatcher->m_thread_data = nullptr; + my_task_dispatcher = nullptr; +} + +inline void thread_data::enter_task_dispatcher(task_dispatcher& task_disp, std::uintptr_t stealing_threshold) { + task_disp.set_stealing_threshold(stealing_threshold); + attach_task_dispatcher(task_disp); +} + +inline void thread_data::leave_task_dispatcher() { + my_task_dispatcher->set_stealing_threshold(0); + detach_task_dispatcher(); +} + +inline void thread_data::propagate_task_group_state(std::atomic d1::task_group_context::* mptr_state, d1::task_group_context& src, std::uint32_t new_state) { + mutex::scoped_lock lock(my_context_list->m_mutex); + // Acquire fence is necessary to ensure that the subsequent node->my_next load + // returned the correct value in case it was just inserted in another thread. + // The fence also ensures visibility of the correct ctx.my_parent value. + for (context_list::iterator it = my_context_list->begin(); it != my_context_list->end(); ++it) { + d1::task_group_context& ctx = __TBB_get_object_ref(d1::task_group_context, my_node, &(*it)); + if ((ctx.*mptr_state).load(std::memory_order_relaxed) != new_state) + task_group_context_impl::propagate_task_group_state(ctx, mptr_state, src, new_state); + } + // Sync up local propagation epoch with the global one. Release fence prevents + // reordering of possible store to *mptr_state after the sync point. + my_context_list->epoch.store(the_context_state_propagation_epoch.load(std::memory_order_relaxed), std::memory_order_release); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_thread_data_H + diff --git a/third_party/tbb/thread_dispatcher.cpp b/third_party/tbb/thread_dispatcher.cpp new file mode 100644 index 000000000..6562d8d10 --- /dev/null +++ b/third_party/tbb/thread_dispatcher.cpp @@ -0,0 +1,225 @@ +// clang-format off +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "third_party/tbb/thread_dispatcher.h" +#include "third_party/tbb/threading_control.h" + +namespace tbb { +namespace detail { +namespace r1 { + +thread_dispatcher::thread_dispatcher(threading_control& tc, unsigned hard_limit, std::size_t stack_size) + : my_threading_control(tc) + , my_num_workers_hard_limit(hard_limit) + , my_stack_size(stack_size) +{ + my_server = governor::create_rml_server( *this ); + __TBB_ASSERT( my_server, "Failed to create RML server" ); +} + +thread_dispatcher::~thread_dispatcher() { + poison_pointer(my_server); +} + +thread_dispatcher_client* thread_dispatcher::select_next_client(thread_dispatcher_client* hint) { + unsigned next_client_priority_level = num_priority_levels; + if (hint) { + next_client_priority_level = hint->priority_level(); + } + + for (unsigned idx = 0; idx < next_client_priority_level; ++idx) { + if (!my_client_list[idx].empty()) { + return &*my_client_list[idx].begin(); + } + } + + return hint; +} + +thread_dispatcher_client* thread_dispatcher::create_client(arena& a) { + return new (cache_aligned_allocate(sizeof(thread_dispatcher_client))) thread_dispatcher_client(a, my_clients_aba_epoch); +} + + +void thread_dispatcher::register_client(thread_dispatcher_client* client) { + client_list_mutex_type::scoped_lock lock(my_list_mutex); + insert_client(*client); +} + +bool thread_dispatcher::try_unregister_client(thread_dispatcher_client* client, std::uint64_t aba_epoch, unsigned priority) { + __TBB_ASSERT(client, nullptr); + // we hold reference to the server, so market cannot be destroyed at any moment here + __TBB_ASSERT(!is_poisoned(my_server), nullptr); + my_list_mutex.lock(); + for (auto& it : my_client_list[priority]) { + if (client == &it) { + if (it.get_aba_epoch() == aba_epoch) { + // Client is alive + // Acquire my_references to sync with threads that just left the arena + // Pay attention that references should be read before workers_requested because + // if references is no zero some other thread might call adjust_demand and lead to + // a race over workers_requested + if (!client->references() && !client->has_request()) { + // Client is abandoned. Destroy it. + remove_client(*client); + ++my_clients_aba_epoch; + + my_list_mutex.unlock(); + destroy_client(client); + + return true; + } + } + break; + } + } + my_list_mutex.unlock(); + return false; +} + +void thread_dispatcher::destroy_client(thread_dispatcher_client* client) { + client->~thread_dispatcher_client(); + cache_aligned_deallocate(client); +} + +// Should be called under lock +void thread_dispatcher::insert_client(thread_dispatcher_client& client) { + __TBB_ASSERT(client.priority_level() < num_priority_levels, nullptr); + my_client_list[client.priority_level()].push_front(client); + + __TBB_ASSERT(!my_next_client || my_next_client->priority_level() < num_priority_levels, nullptr); + my_next_client = select_next_client(my_next_client); +} + +// Should be called under lock +void thread_dispatcher::remove_client(thread_dispatcher_client& client) { + __TBB_ASSERT(client.priority_level() < num_priority_levels, nullptr); + my_client_list[client.priority_level()].remove(client); + + if (my_next_client == &client) { + my_next_client = nullptr; + } + my_next_client = select_next_client(my_next_client); +} + +bool thread_dispatcher::is_client_alive(thread_dispatcher_client* client) { + if (!client) { + return false; + } + + // Still cannot access internals of the client since the object itself might be destroyed. 
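+ // The liveness check is therefore done purely by pointer identity: walk every
+ // priority list (my_list_mutex is expected to be held by the caller, see
+ // client_in_need) and report the client alive only if its address is still present.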
+ for (auto& priority_list : my_client_list) { + for (auto& c : priority_list) { + if (client == &c) { + return true; + } + } + } + return false; +} + +thread_dispatcher_client* thread_dispatcher::client_in_need(client_list_type* clients, thread_dispatcher_client* hint) { + // TODO: make sure client with higher priority returned only if there are available slots in it. + hint = select_next_client(hint); + if (!hint) { + return nullptr; + } + + client_list_type::iterator it = hint; + unsigned curr_priority_level = hint->priority_level(); + __TBB_ASSERT(it != clients[curr_priority_level].end(), nullptr); + do { + thread_dispatcher_client& t = *it; + if (++it == clients[curr_priority_level].end()) { + do { + ++curr_priority_level %= num_priority_levels; + } while (clients[curr_priority_level].empty()); + it = clients[curr_priority_level].begin(); + } + if (t.try_join()) { + return &t; + } + } while (it != hint); + return nullptr; +} + +thread_dispatcher_client* thread_dispatcher::client_in_need(thread_dispatcher_client* prev) { + client_list_mutex_type::scoped_lock lock(my_list_mutex, /*is_writer=*/false); + if (is_client_alive(prev)) { + return client_in_need(my_client_list, prev); + } + return client_in_need(my_client_list, my_next_client); +} + +void thread_dispatcher::adjust_job_count_estimate(int delta) { + my_server->adjust_job_count_estimate(delta); +} + +void thread_dispatcher::release(bool blocking_terminate) { + my_join_workers = blocking_terminate; + my_server->request_close_connection(); +} + +void thread_dispatcher::process(job& j) { + thread_data& td = static_cast(j); + // td.my_last_client can be dead. Don't access it until client_in_need is called + thread_dispatcher_client* client = td.my_last_client; + for (int i = 0; i < 2; ++i) { + while ((client = client_in_need(client)) ) { + td.my_last_client = client; + client->process(td); + } + // Workers leave thread_dispatcher because there is no client in need. It can happen earlier than + // adjust_job_count_estimate() decreases my_slack and RML can put this thread to sleep. + // It might result in a busy-loop checking for my_slack<0 and calling this method instantly. + // the yield refines this spinning. + if ( !i ) { + yield(); + } + } +} + + +//! Used when RML asks for join mode during workers termination. +bool thread_dispatcher::must_join_workers() const { return my_join_workers; } + +//! Returns the requested stack size of worker threads. 
+std::size_t thread_dispatcher::worker_stack_size() const { return my_stack_size; } + +void thread_dispatcher::acknowledge_close_connection() { + my_threading_control.destroy(); +} + +::rml::job* thread_dispatcher::create_one_job() { + unsigned short index = ++my_first_unused_worker_idx; + __TBB_ASSERT(index > 0, nullptr); + ITT_THREAD_SET_NAME(_T("TBB Worker Thread")); + // index serves as a hint decreasing conflicts between workers when they migrate between arenas + thread_data* td = new (cache_aligned_allocate(sizeof(thread_data))) thread_data{ index, true }; + __TBB_ASSERT(index <= my_num_workers_hard_limit, nullptr); + my_threading_control.register_thread(*td); + return td; +} + +void thread_dispatcher::cleanup(job& j) { + my_threading_control.unregister_thread(static_cast(j)); + governor::auto_terminate(&j); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/thread_dispatcher.h b/third_party/tbb/thread_dispatcher.h new file mode 100644 index 000000000..85f3d4766 --- /dev/null +++ b/third_party/tbb/thread_dispatcher.h @@ -0,0 +1,107 @@ +// clang-format off +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_thread_dispatcher_H +#define _TBB_thread_dispatcher_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/rw_mutex.h" +#include "third_party/tbb/task_arena.h" + +#include "third_party/tbb/arena.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/thread_data.h" +#include "third_party/tbb/rml_tbb.h" +#include "third_party/tbb/thread_dispatcher_client.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class threading_control_impl; + +class thread_dispatcher : no_copy, rml::tbb_client { + using client_list_type = intrusive_list; + using client_list_mutex_type = d1::rw_mutex; +public: + thread_dispatcher(threading_control& tc, unsigned hard_limit, std::size_t stack_size); + ~thread_dispatcher(); + + thread_dispatcher_client* create_client(arena& a); + void register_client(thread_dispatcher_client* client); + bool try_unregister_client(thread_dispatcher_client* client, std::uint64_t aba_epoch, unsigned priority); + + void adjust_job_count_estimate(int delta); + void release(bool blocking_terminate); + void process(job& j) override; + //! Used when RML asks for join mode during workers termination. + bool must_join_workers() const; + //! Returns the requested stack size of worker threads. 
+ std::size_t worker_stack_size() const; + +private: + version_type version () const override { return 0; } + unsigned max_job_count () const override { return my_num_workers_hard_limit; } + std::size_t min_stack_size () const override { return worker_stack_size(); } + void cleanup(job& j) override; + void acknowledge_close_connection() override; + ::rml::job* create_one_job() override; + + thread_dispatcher_client* select_next_client(thread_dispatcher_client* hint); + void destroy_client(thread_dispatcher_client* client); + void insert_client(thread_dispatcher_client& client); + void remove_client(thread_dispatcher_client& client); + bool is_client_alive(thread_dispatcher_client* client); + thread_dispatcher_client* client_in_need(client_list_type* clients, thread_dispatcher_client* hint); + thread_dispatcher_client* client_in_need(thread_dispatcher_client* prev); + + friend class threading_control_impl; + static constexpr unsigned num_priority_levels = d1::num_priority_levels; + client_list_mutex_type my_list_mutex; + client_list_type my_client_list[num_priority_levels]; + + thread_dispatcher_client* my_next_client{nullptr}; + + //! Shutdown mode + bool my_join_workers{false}; + + threading_control& my_threading_control; + + //! ABA prevention marker to assign to newly created clients + std::atomic my_clients_aba_epoch{0}; + + //! Maximal number of workers allowed for use by the underlying resource manager + /** It can't be changed after thread_dispatcher creation. **/ + unsigned my_num_workers_hard_limit{0}; + + //! Stack size of worker threads + std::size_t my_stack_size{0}; + + //! First unused index of worker + /** Used to assign indices to the new workers coming from RML **/ + std::atomic my_first_unused_worker_idx{0}; + + //! Pointer to the RML server object that services this TBB instance. + rml::tbb_server* my_server{nullptr}; +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_thread_dispatcher_H diff --git a/third_party/tbb/thread_dispatcher_client.h b/third_party/tbb/thread_dispatcher_client.h new file mode 100644 index 000000000..7c95b5118 --- /dev/null +++ b/third_party/tbb/thread_dispatcher_client.h @@ -0,0 +1,65 @@ +// clang-format off +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef _TBB_thread_dispatcher_client_H +#define _TBB_thread_dispatcher_client_H + +#include "third_party/tbb/detail/_intrusive_list_node.h" +#include "third_party/tbb/arena.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class thread_dispatcher_client : public d1::intrusive_list_node /* Need for list in thread pool */ { +public: + thread_dispatcher_client(arena& a, std::uint64_t aba_epoch) : my_arena(a), my_aba_epoch(aba_epoch) {} + + // Interface of communication with thread pool + bool try_join() { + return my_arena.try_join(); + } + void process(thread_data& td) { + my_arena.process(td); + } + + unsigned priority_level() { + return my_arena.priority_level(); + } + + std::uint64_t get_aba_epoch() { + return my_aba_epoch; + } + + unsigned references() { + return my_arena.references(); + } + + bool has_request() { + return my_arena.has_request(); + } + +private: + arena& my_arena; + std::uint64_t my_aba_epoch; +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_thread_dispatcher_client_H diff --git a/third_party/tbb/thread_request_serializer.cpp b/third_party/tbb/thread_request_serializer.cpp new file mode 100644 index 000000000..534f720ce --- /dev/null +++ b/third_party/tbb/thread_request_serializer.cpp @@ -0,0 +1,139 @@ +// clang-format off +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "third_party/tbb/misc.h" +#include "third_party/tbb/thread_request_serializer.h" + +namespace tbb { +namespace detail { +namespace r1 { + +thread_request_serializer::thread_request_serializer(thread_dispatcher& td, int soft_limit) + : my_thread_dispatcher(td) + , my_soft_limit(soft_limit) +{} + +void thread_request_serializer::update(int delta) { + constexpr std::uint64_t delta_mask = (pending_delta_base << 1) - 1; + constexpr std::uint64_t counter_value = delta_mask + 1; + + int prev_pending_delta = my_pending_delta.fetch_add(counter_value + delta); + + // There is a pseudo request aggregator, so only thread that see pending_delta_base in my_pending_delta + // Will enter to critical section and call adjust_job_count_estimate + if (prev_pending_delta == pending_delta_base) { + delta = int(my_pending_delta.exchange(pending_delta_base) & delta_mask) - int(pending_delta_base); + mutex_type::scoped_lock lock(my_mutex); + my_total_request += delta; + delta = limit_delta(delta, my_soft_limit, my_total_request); + my_thread_dispatcher.adjust_job_count_estimate(delta); + } +} + +void thread_request_serializer::set_active_num_workers(int soft_limit) { + mutex_type::scoped_lock lock(my_mutex); + int delta = soft_limit - my_soft_limit; + delta = limit_delta(delta, my_total_request, soft_limit); + my_thread_dispatcher.adjust_job_count_estimate(delta); + my_soft_limit = soft_limit; +} + +int thread_request_serializer::limit_delta(int delta, int limit, int new_value) { + // This method can be described with such pseudocode: + // bool above_limit = prev_value >= limit && new_value >= limit; + // bool below_limit = prev_value <= limit && new_value <= limit; + // enum request_type { ABOVE_LIMIT, CROSS_LIMIT, BELOW_LIMIT }; + // request = above_limit ? ABOVE_LIMIT : below_limit ? BELOW_LIMIT : CROSS_LIMIT; + + // switch (request) { + // case ABOVE_LIMIT: + // delta = 0; + // case CROSS_LIMIT: + // delta = delta > 0 ? 
limit - prev_value : new_value - limit; + // case BELOW_LIMIT: + // // No changes to delta + // } + + int prev_value = new_value - delta; + + // actual new_value and prev_value cannot exceed the limit + new_value = min(limit, new_value); + prev_value = min(limit, prev_value); + return new_value - prev_value; +} + + +thread_request_serializer_proxy::thread_request_serializer_proxy(thread_dispatcher& td, int soft_limit) : my_serializer(td, soft_limit) +{} + +void thread_request_serializer_proxy::register_mandatory_request(int mandatory_delta) { + if (mandatory_delta != 0) { + mutex_type::scoped_lock lock(my_mutex, /* is_write = */ false); + int prev_value = my_num_mandatory_requests.fetch_add(mandatory_delta); + + const bool should_try_enable = mandatory_delta > 0 && prev_value == 0; + const bool should_try_disable = mandatory_delta < 0 && prev_value == 1; + + if (should_try_enable) { + enable_mandatory_concurrency(lock); + } else if (should_try_disable) { + disable_mandatory_concurrency(lock); + } + } +} + +void thread_request_serializer_proxy::set_active_num_workers(int soft_limit) { + mutex_type::scoped_lock lock(my_mutex, /* is_write = */ true); + + if (soft_limit != 0) { + my_is_mandatory_concurrency_enabled = false; + my_serializer.set_active_num_workers(soft_limit); + } else { + if (my_num_mandatory_requests > 0 && !my_is_mandatory_concurrency_enabled) { + my_is_mandatory_concurrency_enabled = true; + my_serializer.set_active_num_workers(1); + } + } +} + +void thread_request_serializer_proxy::update(int delta) { my_serializer.update(delta); } + +void thread_request_serializer_proxy::enable_mandatory_concurrency(mutex_type::scoped_lock& lock) { + lock.upgrade_to_writer(); + bool still_should_enable = my_num_mandatory_requests.load(std::memory_order_relaxed) > 0 && + !my_is_mandatory_concurrency_enabled && my_serializer.is_no_workers_avaliable(); + + if (still_should_enable) { + my_is_mandatory_concurrency_enabled = true; + my_serializer.set_active_num_workers(1); + } +} + +void thread_request_serializer_proxy::disable_mandatory_concurrency(mutex_type::scoped_lock& lock) { + lock.upgrade_to_writer(); + bool still_should_disable = my_num_mandatory_requests.load(std::memory_order_relaxed) <= 0 && + my_is_mandatory_concurrency_enabled && !my_serializer.is_no_workers_avaliable(); + + if (still_should_disable) { + my_is_mandatory_concurrency_enabled = false; + my_serializer.set_active_num_workers(0); + } +} + +} // r1 +} // detail +} // tbb diff --git a/third_party/tbb/thread_request_serializer.h b/third_party/tbb/thread_request_serializer.h new file mode 100644 index 000000000..2c633853a --- /dev/null +++ b/third_party/tbb/thread_request_serializer.h @@ -0,0 +1,83 @@ +// clang-format off +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef _TBB_thread_serializer_handlers_H +#define _TBB_thread_serializer_handlers_H + +#include "third_party/tbb/mutex.h" +#include "third_party/tbb/rw_mutex.h" + +#include "third_party/tbb/thread_dispatcher.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class thread_request_observer { +protected: + virtual ~thread_request_observer() {} +public: + virtual void update(int delta) = 0; +}; + + +class thread_request_serializer : public thread_request_observer { + using mutex_type = d1::mutex; +public: + thread_request_serializer(thread_dispatcher& td, int soft_limit); + void set_active_num_workers(int soft_limit); + bool is_no_workers_avaliable() { return my_soft_limit == 0; } + +private: + friend class thread_request_serializer_proxy; + void update(int delta) override; + static int limit_delta(int delta, int limit, int new_value); + + thread_dispatcher& my_thread_dispatcher; + int my_soft_limit{ 0 }; + int my_total_request{ 0 }; + // my_pending_delta is set to pending_delta_base to have ability to hold negative values + // consider increase base since thead number will be bigger than 1 << 15 + static constexpr std::uint64_t pending_delta_base = 1 << 15; + std::atomic my_pending_delta{ pending_delta_base }; + mutex_type my_mutex; +}; + +// Handles mandatory concurrency i.e. enables worker threads for enqueue tasks +class thread_request_serializer_proxy : public thread_request_observer { + using mutex_type = d1::rw_mutex; +public: + thread_request_serializer_proxy(thread_dispatcher& td, int soft_limit); + void register_mandatory_request(int mandatory_delta); + void set_active_num_workers(int soft_limit); + +private: + void update(int delta) override; + void enable_mandatory_concurrency(mutex_type::scoped_lock& lock); + void disable_mandatory_concurrency(mutex_type::scoped_lock& lock); + + std::atomic my_num_mandatory_requests{0}; + bool my_is_mandatory_concurrency_enabled{false}; + thread_request_serializer my_serializer; + mutex_type my_mutex; +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_thread_serializer_handlers_H diff --git a/third_party/tbb/threading_control.cpp b/third_party/tbb/threading_control.cpp new file mode 100644 index 000000000..9f48853ed --- /dev/null +++ b/third_party/tbb/threading_control.cpp @@ -0,0 +1,392 @@ +// clang-format off +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "third_party/tbb/threading_control.h" +#include "third_party/tbb/permit_manager.h" +#include "third_party/tbb/market.h" +#include "third_party/tbb/thread_dispatcher.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/thread_dispatcher_client.h" + +namespace tbb { +namespace detail { +namespace r1 { + +// ---------------------------------------- threading_control_impl -------------------------------------------------------------- + +std::size_t global_control_active_value_unsafe(d1::global_control::parameter); + +std::pair threading_control_impl::calculate_workers_limits() { + // Expecting that 4P is suitable for most applications. + // Limit to 2P for large thread number. + // TODO: ask RML for max concurrency and possibly correct hard_limit + unsigned factor = governor::default_num_threads() <= 128 ? 4 : 2; + + // The requested number of threads is intentionally not considered in + // computation of the hard limit, in order to separate responsibilities + // and avoid complicated interactions between global_control and task_scheduler_init. + // The threading control guarantees that at least 256 threads might be created. + unsigned workers_app_limit = global_control_active_value_unsafe(global_control::max_allowed_parallelism); + unsigned workers_hard_limit = max(max(factor * governor::default_num_threads(), 256u), workers_app_limit); + unsigned workers_soft_limit = calc_workers_soft_limit(workers_hard_limit); + + return std::make_pair(workers_soft_limit, workers_hard_limit); +} + +unsigned threading_control_impl::calc_workers_soft_limit(unsigned workers_hard_limit) { + unsigned workers_soft_limit{}; + unsigned soft_limit = global_control_active_value_unsafe(global_control::max_allowed_parallelism); + + // if user set no limits (yet), use default value + workers_soft_limit = soft_limit != 0 ? 
+
+    if (workers_soft_limit >= workers_hard_limit) {
+        workers_soft_limit = workers_hard_limit - 1;
+    }
+
+    return workers_soft_limit;
+}
+
+cache_aligned_unique_ptr<permit_manager> threading_control_impl::make_permit_manager(unsigned workers_soft_limit) {
+    return make_cache_aligned_unique<market>(workers_soft_limit);
+}
+
+cache_aligned_unique_ptr<thread_dispatcher> threading_control_impl::make_thread_dispatcher(threading_control& tc,
+                                                                                           unsigned workers_soft_limit,
+                                                                                           unsigned workers_hard_limit)
+{
+    stack_size_type stack_size = global_control_active_value_unsafe(global_control::thread_stack_size);
+
+    cache_aligned_unique_ptr<thread_dispatcher> td =
+        make_cache_aligned_unique<thread_dispatcher>(tc, workers_hard_limit, stack_size);
+    // This check relies on the fact that for shared RML default_concurrency == max_concurrency
+    if (!governor::UsePrivateRML && td->my_server->default_concurrency() < workers_soft_limit) {
+        runtime_warning("RML might limit the number of workers to %u while %u is requested.\n",
+                        td->my_server->default_concurrency(), workers_soft_limit);
+    }
+
+    return td;
+}
+
+threading_control_impl::threading_control_impl(threading_control* tc) {
+    unsigned workers_soft_limit{}, workers_hard_limit{};
+    std::tie(workers_soft_limit, workers_hard_limit) = calculate_workers_limits();
+
+    my_permit_manager = make_permit_manager(workers_soft_limit);
+    my_thread_dispatcher = make_thread_dispatcher(*tc, workers_soft_limit, workers_hard_limit);
+    my_thread_request_serializer =
+        make_cache_aligned_unique<thread_request_serializer_proxy>(*my_thread_dispatcher, workers_soft_limit);
+    my_permit_manager->set_thread_request_observer(*my_thread_request_serializer);
+
+    my_cancellation_disseminator = make_cache_aligned_unique<cancellation_disseminator>();
+    my_waiting_threads_monitor = make_cache_aligned_unique<thread_control_monitor>();
+}
+
+void threading_control_impl::release(bool blocking_terminate) {
+    my_thread_dispatcher->release(blocking_terminate);
+}
+
+void threading_control_impl::set_active_num_workers(unsigned soft_limit) {
+    __TBB_ASSERT(soft_limit <= my_thread_dispatcher->my_num_workers_hard_limit, nullptr);
+    my_thread_request_serializer->set_active_num_workers(soft_limit);
+    my_permit_manager->set_active_num_workers(soft_limit);
+}
+
+threading_control_client threading_control_impl::create_client(arena& a) {
+    pm_client* pm_client = my_permit_manager->create_client(a);
+    thread_dispatcher_client* td_client = my_thread_dispatcher->create_client(a);
+
+    return threading_control_client{pm_client, td_client};
+}
+
+threading_control_impl::client_snapshot threading_control_impl::prepare_client_destruction(threading_control_client client) {
+    auto td_client = client.get_thread_dispatcher_client();
+    return {td_client->get_aba_epoch(), td_client->priority_level(), td_client, client.get_pm_client()};
+}
+
+bool threading_control_impl::try_destroy_client(threading_control_impl::client_snapshot snapshot) {
+    if (my_thread_dispatcher->try_unregister_client(snapshot.my_td_client, snapshot.aba_epoch, snapshot.priority_level)) {
+        my_permit_manager->unregister_and_destroy_client(*snapshot.my_pm_client);
+        return true;
+    }
+    return false;
+}
+
+void threading_control_impl::publish_client(threading_control_client tc_client) {
+    my_permit_manager->register_client(tc_client.get_pm_client());
+    my_thread_dispatcher->register_client(tc_client.get_thread_dispatcher_client());
+}
+
+void threading_control_impl::register_thread(thread_data& td) {
+    my_cancellation_disseminator->register_thread(td);
+}
+void threading_control_impl::unregister_thread(thread_data& td) {
my_cancellation_disseminator->unregister_thread(td); +} + +void threading_control_impl::propagate_task_group_state(std::atomic d1::task_group_context::*mptr_state, + d1::task_group_context& src, uint32_t new_state) +{ + my_cancellation_disseminator->propagate_task_group_state(mptr_state, src, new_state); +} + +std::size_t threading_control_impl::worker_stack_size() { + return my_thread_dispatcher->worker_stack_size(); +} + +unsigned threading_control_impl::max_num_workers() { + return my_thread_dispatcher->my_num_workers_hard_limit; +} + +void threading_control_impl::adjust_demand(threading_control_client tc_client, int mandatory_delta, int workers_delta) { + auto& c = *tc_client.get_pm_client(); + my_thread_request_serializer->register_mandatory_request(mandatory_delta); + my_permit_manager->adjust_demand(c, mandatory_delta, workers_delta); +} + +thread_control_monitor& threading_control_impl::get_waiting_threads_monitor() { + return *my_waiting_threads_monitor; +} + +// ---------------------------------------- threading_control ------------------------------------------------------------------- + +// Defined in global_control.cpp +void global_control_lock(); +void global_control_unlock(); + +void threading_control::add_ref(bool is_public) { + ++my_ref_count; + if (is_public) { + my_public_ref_count++; + } +} + +bool threading_control::remove_ref(bool is_public) { + if (is_public) { + __TBB_ASSERT(g_threading_control == this, "Global threading control instance was destroyed prematurely?"); + __TBB_ASSERT(my_public_ref_count.load(std::memory_order_relaxed), nullptr); + --my_public_ref_count; + } + + bool is_last_ref = --my_ref_count == 0; + if (is_last_ref) { + __TBB_ASSERT(!my_public_ref_count.load(std::memory_order_relaxed), nullptr); + g_threading_control = nullptr; + } + + return is_last_ref; +} + +threading_control* threading_control::get_threading_control(bool is_public) { + threading_control* control = g_threading_control; + if (control) { + control->add_ref(is_public); + } + + return control; +} + +threading_control* threading_control::create_threading_control() { + // Global control should be locked before threading_control_impl + global_control_lock(); + + threading_control* thr_control{ nullptr }; + try_call([&] { + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + + thr_control = get_threading_control(/*public = */ true); + if (thr_control == nullptr) { + thr_control = new (cache_aligned_allocate(sizeof(threading_control))) threading_control(/*public_ref = */ 1, /*private_ref = */ 1); + thr_control->my_pimpl = make_cache_aligned_unique(thr_control); + + __TBB_InitOnce::add_ref(); + + if (global_control_active_value_unsafe(global_control::scheduler_handle)) { + ++thr_control->my_public_ref_count; + ++thr_control->my_ref_count; + } + + g_threading_control = thr_control; + } + }).on_exception([&] { + global_control_unlock(); + + cache_aligned_deleter deleter{}; + deleter(thr_control); + }); + + global_control_unlock(); + return thr_control; +} + +void threading_control::destroy () { + cache_aligned_deleter deleter; + deleter(this); + __TBB_InitOnce::remove_ref(); +} + +void threading_control::wait_last_reference(global_mutex_type::scoped_lock& lock) { + while (my_public_ref_count.load(std::memory_order_relaxed) == 1 && my_ref_count.load(std::memory_order_relaxed) > 1) { + lock.release(); + // To guarantee that request_close_connection() is called by the last external thread, we need to wait till all + // references are released. 
Re-read my_public_ref_count to limit waiting if new external threads are created. + // Theoretically, new private references to the threading control can be added during waiting making it potentially + // endless. + // TODO: revise why the weak scheduler needs threading control's pointer and try to remove this wait. + // Note that the threading control should know about its schedulers for cancellation/exception/priority propagation, + // see e.g. task_group_context::cancel_group_execution() + while (my_public_ref_count.load(std::memory_order_acquire) == 1 && my_ref_count.load(std::memory_order_acquire) > 1) { + yield(); + } + lock.acquire(g_threading_control_mutex); + } +} + +bool threading_control::release(bool is_public, bool blocking_terminate) { + bool do_release = false; + { + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + if (blocking_terminate) { + __TBB_ASSERT(is_public, "Only an object with a public reference can request the blocking terminate"); + wait_last_reference(lock); + } + do_release = remove_ref(is_public); + } + + if (do_release) { + __TBB_ASSERT(!my_public_ref_count.load(std::memory_order_relaxed), "No public references must remain if we remove the threading control."); + // inform RML that blocking termination is required + my_pimpl->release(blocking_terminate); + return blocking_terminate; + } + return false; +} + +threading_control::threading_control(unsigned public_ref, unsigned ref) : my_public_ref_count(public_ref), my_ref_count(ref) +{} + +threading_control* threading_control::register_public_reference() { + threading_control* control{nullptr}; + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + control = get_threading_control(/*public = */ true); + if (!control) { + // We are going to create threading_control_impl, we should acquire mutexes in right order + lock.release(); + control = create_threading_control(); + } + + return control; +} + +bool threading_control::unregister_public_reference(bool blocking_terminate) { + __TBB_ASSERT(g_threading_control, "Threading control should exist until last public reference"); + __TBB_ASSERT(g_threading_control->my_public_ref_count.load(std::memory_order_relaxed), nullptr); + return g_threading_control->release(/*public = */ true, /*blocking_terminate = */ blocking_terminate); +} + +threading_control_client threading_control::create_client(arena& a) { + { + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + add_ref(/*public = */ false); + } + + return my_pimpl->create_client(a); +} + +void threading_control::publish_client(threading_control_client client) { + return my_pimpl->publish_client(client); +} + +threading_control::client_snapshot threading_control::prepare_client_destruction(threading_control_client client) { + return my_pimpl->prepare_client_destruction(client); +} + +bool threading_control::try_destroy_client(threading_control::client_snapshot deleter) { + bool res = my_pimpl->try_destroy_client(deleter); + if (res) { + release(/*public = */ false, /*blocking_terminate = */ false); + } + return res; +} + +void threading_control::set_active_num_workers(unsigned soft_limit) { + threading_control* thr_control = get_threading_control(/*public = */ false); + if (thr_control != nullptr) { + thr_control->my_pimpl->set_active_num_workers(soft_limit); + thr_control->release(/*is_public=*/false, /*blocking_terminate=*/false); + } +} + +bool threading_control::is_present() { + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + return g_threading_control != 
nullptr; +} + +bool threading_control::register_lifetime_control() { + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + return get_threading_control(/*public = */ true) != nullptr; +} + +bool threading_control::unregister_lifetime_control(bool blocking_terminate) { + threading_control* thr_control{nullptr}; + { + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + thr_control = g_threading_control; + } + + bool released{true}; + if (thr_control) { + released = thr_control->release(/*public = */ true, /*blocking_terminate = */ blocking_terminate); + } + + return released; +} + +void threading_control::register_thread(thread_data& td) { + my_pimpl->register_thread(td); +} + +void threading_control::unregister_thread(thread_data& td) { + my_pimpl->unregister_thread(td); +} + +void threading_control::propagate_task_group_state(std::atomic d1::task_group_context::*mptr_state, + d1::task_group_context& src, uint32_t new_state) +{ + my_pimpl->propagate_task_group_state(mptr_state, src, new_state); +} + +std::size_t threading_control::worker_stack_size() { + return my_pimpl->worker_stack_size(); +} + +unsigned threading_control::max_num_workers() { + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + return g_threading_control ? g_threading_control->my_pimpl->max_num_workers() : 0; +} + +void threading_control::adjust_demand(threading_control_client client, int mandatory_delta, int workers_delta) { + my_pimpl->adjust_demand(client, mandatory_delta, workers_delta); +} + +thread_control_monitor& threading_control::get_waiting_threads_monitor() { + return my_pimpl->get_waiting_threads_monitor(); +} + +} // r1 +} // detail +} // tbb diff --git a/third_party/tbb/threading_control.h b/third_party/tbb/threading_control.h new file mode 100644 index 000000000..b42c9dd42 --- /dev/null +++ b/third_party/tbb/threading_control.h @@ -0,0 +1,153 @@ +// clang-format off +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/
+
+#ifndef _TBB_threading_control_H
+#define _TBB_threading_control_H
+
+#include "third_party/tbb/mutex.h"
+#include "third_party/tbb/global_control.h"
+
+#include "third_party/tbb/threading_control_client.h"
+#include "third_party/tbb/intrusive_list.h"
+#include "third_party/tbb/main.h"
+#include "third_party/tbb/permit_manager.h"
+#include "third_party/tbb/pm_client.h"
+#include "third_party/tbb/thread_dispatcher.h"
+#include "third_party/tbb/cancellation_disseminator.h"
+#include "third_party/tbb/thread_request_serializer.h"
+#include "third_party/tbb/scheduler_common.h"
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+class arena;
+class thread_data;
+
+class threading_control;
+
+class threading_control_impl {
+public:
+    threading_control_impl(threading_control*);
+
+public:
+    void release(bool blocking_terminate);
+
+    threading_control_client create_client(arena& a);
+    void publish_client(threading_control_client client);
+
+    struct client_snapshot {
+        std::uint64_t aba_epoch;
+        unsigned priority_level;
+        thread_dispatcher_client* my_td_client;
+        pm_client* my_pm_client;
+    };
+
+    client_snapshot prepare_client_destruction(threading_control_client client);
+    bool try_destroy_client(client_snapshot deleter);
+
+    void register_thread(thread_data& td);
+    void unregister_thread(thread_data& td);
+    void propagate_task_group_state(std::atomic<uint32_t> d1::task_group_context::*mptr_state,
+                                    d1::task_group_context& src, uint32_t new_state);
+
+    void set_active_num_workers(unsigned soft_limit);
+    std::size_t worker_stack_size();
+    unsigned max_num_workers();
+
+    void adjust_demand(threading_control_client, int mandatory_delta, int workers_delta);
+
+    thread_control_monitor& get_waiting_threads_monitor();
+
+private:
+    static unsigned calc_workers_soft_limit(unsigned workers_hard_limit);
+    static std::pair<unsigned, unsigned> calculate_workers_limits();
+    static cache_aligned_unique_ptr<permit_manager> make_permit_manager(unsigned workers_soft_limit);
+    static cache_aligned_unique_ptr<thread_dispatcher> make_thread_dispatcher(threading_control& control,
+                                                                              unsigned workers_soft_limit,
+                                                                              unsigned workers_hard_limit);
+
+    // TODO: Consider allocating one chunk of memory and constructing the objects in it
+    cache_aligned_unique_ptr<permit_manager> my_permit_manager{nullptr};
+    cache_aligned_unique_ptr<thread_dispatcher> my_thread_dispatcher{nullptr};
+    cache_aligned_unique_ptr<thread_request_serializer_proxy> my_thread_request_serializer{nullptr};
+    cache_aligned_unique_ptr<cancellation_disseminator> my_cancellation_disseminator{nullptr};
+    cache_aligned_unique_ptr<thread_control_monitor> my_waiting_threads_monitor{nullptr};
+};
+
+
+class threading_control {
+    using global_mutex_type = d1::mutex;
+public:
+    using client_snapshot = threading_control_impl::client_snapshot;
+
+    static threading_control* register_public_reference();
+    static bool unregister_public_reference(bool blocking_terminate);
+
+    static bool is_present();
+    static void set_active_num_workers(unsigned soft_limit);
+    static bool register_lifetime_control();
+    static bool unregister_lifetime_control(bool blocking_terminate);
+
+    threading_control_client create_client(arena& a);
+    void publish_client(threading_control_client client);
+    client_snapshot prepare_client_destruction(threading_control_client client);
+    bool try_destroy_client(client_snapshot deleter);
+
+    void register_thread(thread_data& td);
+    void unregister_thread(thread_data& td);
+    void propagate_task_group_state(std::atomic<uint32_t> d1::task_group_context::*mptr_state,
+                                    d1::task_group_context& src, uint32_t new_state);
+
+    std::size_t worker_stack_size();
+    static unsigned max_num_workers();
+
+    void adjust_demand(threading_control_client client, int mandatory_delta, int workers_delta);
+
+    thread_control_monitor& get_waiting_threads_monitor();
+
+private:
+    threading_control(unsigned public_ref, unsigned ref);
+    void add_ref(bool is_public);
+    bool remove_ref(bool is_public);
+
+    static threading_control* get_threading_control(bool is_public);
+    static threading_control* create_threading_control();
+
+    bool release(bool is_public, bool blocking_terminate);
+    void wait_last_reference(global_mutex_type::scoped_lock& lock);
+    void destroy();
+
+    friend class thread_dispatcher;
+
+    static threading_control* g_threading_control;
+    //! Mutex guarding creation/destruction of g_threading_control, insertions/deletions in my_arenas, and cancellation propagation
+    static global_mutex_type g_threading_control_mutex;
+
+    cache_aligned_unique_ptr<threading_control_impl> my_pimpl{nullptr};
+    //! Count of external threads attached
+    std::atomic<unsigned> my_public_ref_count{0};
+    //! Reference count controlling threading_control object lifetime
+    std::atomic<unsigned> my_ref_count{0};
+};
+
+} // r1
+} // detail
+} // tbb
+
+
+#endif // _TBB_threading_control_H
diff --git a/third_party/tbb/threading_control_client.h b/third_party/tbb/threading_control_client.h
new file mode 100644
index 000000000..941a9de3f
--- /dev/null
+++ b/third_party/tbb/threading_control_client.h
@@ -0,0 +1,59 @@
+// clang-format off
+/*
+    Copyright (c) 2022-2023 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#ifndef _TBB_threading_control_client_H
+#define _TBB_threading_control_client_H
+
+#include "third_party/tbb/detail/_assert.h"
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+class pm_client;
+class thread_dispatcher_client;
+
+class threading_control_client {
+public:
+    threading_control_client() = default;
+    threading_control_client(const threading_control_client&) = default;
+    threading_control_client& operator=(const threading_control_client&) = default;
+
+    threading_control_client(pm_client* p, thread_dispatcher_client* t) : my_pm_client(p), my_thread_dispatcher_client(t) {
+        __TBB_ASSERT(my_pm_client, nullptr);
+        __TBB_ASSERT(my_thread_dispatcher_client, nullptr);
+    }
+
+    pm_client* get_pm_client() {
+        return my_pm_client;
+    }
+
+    thread_dispatcher_client* get_thread_dispatcher_client() {
+        return my_thread_dispatcher_client;
+    }
+
+private:
+    pm_client* my_pm_client{nullptr};
+    thread_dispatcher_client* my_thread_dispatcher_client{nullptr};
+};
+
+
+}
+}
+}
+
+#endif // _TBB_threading_control_client_H
diff --git a/third_party/tbb/tick_count.h b/third_party/tbb/tick_count.h
new file mode 100644
index 000000000..37880a9c4
--- /dev/null
+++ b/third_party/tbb/tick_count.h
@@ -0,0 +1,100 @@
+// clang-format off
+/*
+    Copyright (c) 2005-2021 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#ifndef __TBB_tick_count_H
+#define __TBB_tick_count_H
+
+#include "third_party/libcxx/chrono"
+
+#include "third_party/tbb/detail/_namespace_injection.h"
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+
+//! Absolute timestamp
+/** @ingroup timing */
+class tick_count {
+public:
+    using clock_type = typename std::conditional<std::chrono::high_resolution_clock::is_steady,
+                                                 std::chrono::high_resolution_clock, std::chrono::steady_clock>::type;
+
+    //! Relative time interval.
+    class interval_t : public clock_type::duration {
+    public:
+        //! Construct a time interval representing zero time duration
+        interval_t() : clock_type::duration(clock_type::duration::zero()) {}
+
+        //! Construct a time interval representing sec seconds time duration
+        explicit interval_t( double sec )
+            : clock_type::duration(std::chrono::duration_cast<clock_type::duration>(std::chrono::duration<double>(sec))) {}
+
+        //! Return the length of a time interval in seconds
+        double seconds() const {
+            return std::chrono::duration_cast<std::chrono::duration<double>>(*this).count();
+        }
+
+        //! Extract the intervals from the tick_counts and subtract them.
+        friend interval_t operator-( const tick_count& t1, const tick_count& t0 );
+
+        //! Add two intervals.
+        friend interval_t operator+( const interval_t& i, const interval_t& j ) {
+            return interval_t(std::chrono::operator+(i, j));
+        }
+
+        //! Subtract two intervals.
+        friend interval_t operator-( const interval_t& i, const interval_t& j ) {
+            return interval_t(std::chrono::operator-(i, j));
+        }
+
+    private:
+        explicit interval_t( clock_type::duration value_ ) : clock_type::duration(value_) {}
+    };
+
+    tick_count() = default;
+
+    //! Return current time.
+    static tick_count now() {
+        return clock_type::now();
+    }
+
+    //! Subtract two timestamps to get the time interval between them
+    friend interval_t operator-( const tick_count& t1, const tick_count& t0 ) {
+        return tick_count::interval_t(t1.my_time_point - t0.my_time_point);
+    }
+
+    //! Return the resolution of the clock in seconds per tick.
+    static double resolution() {
+        return static_cast<double>(interval_t::period::num) / interval_t::period::den;
+    }
+
+private:
+    clock_type::time_point my_time_point;
+    tick_count( clock_type::time_point tp ) : my_time_point(tp) {}
+};
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+    using detail::d1::tick_count;
+} // namespace v1
+
+} // namespace tbb
+
+#endif /* __TBB_tick_count_H */
diff --git a/third_party/tbb/tls.h b/third_party/tbb/tls.h
new file mode 100644
index 000000000..7a143a915
--- /dev/null
+++ b/third_party/tbb/tls.h
@@ -0,0 +1,103 @@
+// clang-format off
+/*
+    Copyright (c) 2005-2022 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/ + +#ifndef _TBB_tls_H +#define _TBB_tls_H + +#include "third_party/tbb/detail/_config.h" + +#if __TBB_USE_POSIX +#include "libc/calls/weirdtypes.h" +#include "libc/sysv/consts/clock.h" +#include "libc/thread/thread.h" +#include "libc/thread/thread2.h" +#else /* assume __TBB_USE_WINAPI */ +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" +#endif + +namespace tbb { +namespace detail { +namespace r1 { + +typedef void (*tls_dtor_t)(void*); + +//! Basic cross-platform wrapper class for TLS operations. +template +class basic_tls { +#if __TBB_USE_POSIX + typedef pthread_key_t tls_key_t; +public: + int create( tls_dtor_t dtor = nullptr ) { + return pthread_key_create(&my_key, dtor); + } + int destroy() { return pthread_key_delete(my_key); } + void set( T value ) { pthread_setspecific(my_key, (void*)value); } + T get() { return (T)pthread_getspecific(my_key); } +#else /* __TBB_USE_WINAPI */ + typedef DWORD tls_key_t; +public: +#if !__TBB_WIN8UI_SUPPORT + int create() { + tls_key_t tmp = TlsAlloc(); + if( tmp==TLS_OUT_OF_INDEXES ) + return TLS_OUT_OF_INDEXES; + my_key = tmp; + return 0; + } + int destroy() { TlsFree(my_key); my_key=0; return 0; } + void set( T value ) { TlsSetValue(my_key, (LPVOID)value); } + T get() { return (T)TlsGetValue(my_key); } +#else /*!__TBB_WIN8UI_SUPPORT*/ + int create() { + tls_key_t tmp = FlsAlloc(nullptr); + if( tmp== (DWORD)0xFFFFFFFF ) + return (DWORD)0xFFFFFFFF; + my_key = tmp; + return 0; + } + int destroy() { FlsFree(my_key); my_key=0; return 0; } + void set( T value ) { FlsSetValue(my_key, (LPVOID)value); } + T get() { return (T)FlsGetValue(my_key); } +#endif /* !__TBB_WIN8UI_SUPPORT */ +#endif /* __TBB_USE_WINAPI */ +private: + tls_key_t my_key; +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_tls_H */ diff --git a/third_party/tbb/version.cpp b/third_party/tbb/version.cpp new file mode 100644 index 000000000..d86164b2b --- /dev/null +++ b/third_party/tbb/version.cpp @@ -0,0 +1,27 @@ +// clang-format off +/* + Copyright (c) 2020-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "third_party/tbb/version.h" + +extern "C" int TBB_runtime_interface_version() { + return TBB_INTERFACE_VERSION; +} + +extern "C" const char* TBB_runtime_version() { + static const char version_str[] = TBB_VERSION_STRING; + return version_str; +} diff --git a/third_party/tbb/version.h b/third_party/tbb/version.h new file mode 100644 index 000000000..eae21b2c4 --- /dev/null +++ b/third_party/tbb/version.h @@ -0,0 +1,115 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_version_H +#define __TBB_version_H + +// Exclude all includes during .rc files compilation +#ifndef RC_INVOKED + #include "third_party/tbb/detail/_config.h" + #include "third_party/tbb/detail/_namespace_injection.h" +#else + #define __TBB_STRING_AUX(x) #x + #define __TBB_STRING(x) __TBB_STRING_AUX(x) +#endif + +// Product version +#define TBB_VERSION_MAJOR 2021 +// Update version +#define TBB_VERSION_MINOR 10 +// "Patch" version for custom releases +#define TBB_VERSION_PATCH 0 +// Suffix string +#define __TBB_VERSION_SUFFIX "" +// Full official version string +#define TBB_VERSION_STRING __TBB_STRING(TBB_VERSION_MAJOR) "." __TBB_STRING(TBB_VERSION_MINOR) __TBB_VERSION_SUFFIX + +// OneAPI oneTBB specification version +#define ONETBB_SPEC_VERSION "1.0" +// Full interface version +#define TBB_INTERFACE_VERSION 12100 +// Major interface version +#define TBB_INTERFACE_VERSION_MAJOR (TBB_INTERFACE_VERSION/1000) +// Minor interface version +#define TBB_INTERFACE_VERSION_MINOR (TBB_INTERFACE_VERSION%1000/10) + +// The binary compatibility version +// To be used in SONAME, manifests, etc. +#define __TBB_BINARY_VERSION 12 + +//! 
TBB_VERSION support +#ifndef ENDL +#define ENDL "\n" +#endif + +//TBB_REVAMP_TODO: consider enabling version_string.ver generation +//TBB_REVAMP_TODO: // MISSING #include "version_string.ver" + +#define __TBB_ONETBB_SPEC_VERSION(N) #N ": SPECIFICATION VERSION\t" ONETBB_SPEC_VERSION ENDL +#define __TBB_VERSION_NUMBER(N) #N ": VERSION\t\t" TBB_VERSION_STRING ENDL +#define __TBB_INTERFACE_VERSION_NUMBER(N) #N ": INTERFACE VERSION\t" __TBB_STRING(TBB_INTERFACE_VERSION) ENDL + +#ifndef TBB_USE_DEBUG + #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\tundefined" ENDL +#elif TBB_USE_DEBUG==0 + #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t0" ENDL +#elif TBB_USE_DEBUG==1 + #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t1" ENDL +#elif TBB_USE_DEBUG==2 + #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t2" ENDL +#else + #error Unexpected value for TBB_USE_DEBUG +#endif + +#ifndef TBB_USE_ASSERT + #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\tundefined" ENDL +#elif TBB_USE_ASSERT==0 + #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t0" ENDL +#elif TBB_USE_ASSERT==1 + #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t1" ENDL +#elif TBB_USE_ASSERT==2 + #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t2" ENDL +#else + #error Unexpected value for TBB_USE_ASSERT +#endif + +#define TBB_VERSION_STRINGS_P(N) \ + __TBB_ONETBB_SPEC_VERSION(N) \ + __TBB_VERSION_NUMBER(N) \ + __TBB_INTERFACE_VERSION_NUMBER(N) \ + __TBB_VERSION_USE_DEBUG(N) \ + __TBB_VERSION_USE_ASSERT(N) + +#define TBB_VERSION_STRINGS TBB_VERSION_STRINGS_P(oneTBB) +#define TBBMALLOC_VERSION_STRINGS TBB_VERSION_STRINGS_P(TBBmalloc) + +//! The function returns the version string for the Intel(R) oneAPI Threading Building Blocks (oneTBB) +//! shared library being used. +/** + * The returned pointer is an address of a string in the shared library. + * It can be different than the TBB_VERSION_STRING obtained at compile time. + */ +extern "C" TBB_EXPORT const char* __TBB_EXPORTED_FUNC TBB_runtime_version(); + +//! The function returns the interface version of the oneTBB shared library being used. +/** + * The returned version is determined at runtime, not at compile/link time. + * It can be different than the value of TBB_INTERFACE_VERSION obtained at compile time. + */ +extern "C" TBB_EXPORT int __TBB_EXPORTED_FUNC TBB_runtime_interface_version(); + +#endif // __TBB_version_H diff --git a/third_party/tbb/waiters.h b/third_party/tbb/waiters.h new file mode 100644 index 000000000..d9ca0467b --- /dev/null +++ b/third_party/tbb/waiters.h @@ -0,0 +1,202 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef _TBB_waiters_H +#define _TBB_waiters_H + +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/scheduler_common.h" +#include "third_party/tbb/arena.h" +#include "third_party/tbb/threading_control.h" + +namespace tbb { +namespace detail { +namespace r1 { + +inline d1::task* get_self_recall_task(arena_slot& slot); + +class waiter_base { +public: + waiter_base(arena& a, int yields_multiplier = 1) : my_arena(a), my_backoff(int(a.my_num_slots), yields_multiplier) {} + + bool pause() { + if (my_backoff.pause()) { + my_arena.out_of_work(); + return true; + } + + return false; + } + + void reset_wait() { + my_backoff.reset_wait(); + } + +protected: + arena& my_arena; + stealing_loop_backoff my_backoff; +}; + +class outermost_worker_waiter : public waiter_base { +public: + using waiter_base::waiter_base; + + bool continue_execution(arena_slot& slot, d1::task*& t) const { + __TBB_ASSERT(t == nullptr, nullptr); + + if (is_worker_should_leave(slot)) { + // Leave dispatch loop + return false; + } + + t = get_self_recall_task(slot); + return true; + } + + void pause(arena_slot&) { + waiter_base::pause(); + } + + + d1::wait_context* wait_ctx() { + return nullptr; + } + + static bool postpone_execution(d1::task&) { + return false; + } + +private: + using base_type = waiter_base; + + bool is_worker_should_leave(arena_slot& slot) const { + bool is_top_priority_arena = my_arena.is_top_priority(); + bool is_task_pool_empty = slot.task_pool.load(std::memory_order_relaxed) == EmptyTaskPool; + + if (is_top_priority_arena) { + // Worker in most priority arena do not leave arena, until all work in task_pool is done + if (is_task_pool_empty && my_arena.is_recall_requested()) { + return true; + } + } else { + if (my_arena.is_recall_requested()) { + // If worker has work in task pool, we must notify other threads, + // because can appear missed wake up of other threads + if (!is_task_pool_empty) { + my_arena.advertise_new_work(); + } + return true; + } + } + + return false; + } +}; + +class sleep_waiter : public waiter_base { +protected: + using waiter_base::waiter_base; + + template + void sleep(std::uintptr_t uniq_tag, Pred wakeup_condition) { + my_arena.get_waiting_threads_monitor().wait(wakeup_condition, + market_context{uniq_tag, &my_arena}); + } +}; + +class external_waiter : public sleep_waiter { +public: + external_waiter(arena& a, d1::wait_context& wo) + : sleep_waiter(a, /*yields_multiplier*/10), my_wait_ctx(wo) + {} + + bool continue_execution(arena_slot& slot, d1::task*& t) const { + __TBB_ASSERT(t == nullptr, nullptr); + if (!my_wait_ctx.continue_execution()) + return false; + t = get_self_recall_task(slot); + return true; + } + + void pause(arena_slot&) { + if (!sleep_waiter::pause()) { + return; + } + + auto wakeup_condition = [&] { return !my_arena.is_empty() || !my_wait_ctx.continue_execution(); }; + + sleep(std::uintptr_t(&my_wait_ctx), wakeup_condition); + my_backoff.reset_wait(); + } + + d1::wait_context* wait_ctx() { + return &my_wait_ctx; + } + + static bool postpone_execution(d1::task&) { + return false; + } + +private: + d1::wait_context& my_wait_ctx; +}; + +#if __TBB_RESUMABLE_TASKS + +class coroutine_waiter : public sleep_waiter { +public: + using sleep_waiter::sleep_waiter; + + bool continue_execution(arena_slot& slot, d1::task*& t) const { + __TBB_ASSERT(t == nullptr, nullptr); + t = get_self_recall_task(slot); + return true; + } + + void pause(arena_slot& slot) { + if (!sleep_waiter::pause()) { + return; + } + + suspend_point_type* sp = 
slot.default_task_dispatcher().m_suspend_point; + + auto wakeup_condition = [&] { return !my_arena.is_empty() || sp->m_is_owner_recalled.load(std::memory_order_relaxed); }; + + sleep(std::uintptr_t(sp), wakeup_condition); + my_backoff.reset_wait(); + } + + void reset_wait() { + my_backoff.reset_wait(); + } + + d1::wait_context* wait_ctx() { + return nullptr; + } + + static bool postpone_execution(d1::task& t) { + return task_accessor::is_resume_task(t); + } +}; + +#endif // __TBB_RESUMABLE_TASKS + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_waiters_H diff --git a/third_party/third_party.mk b/third_party/third_party.mk index 29062ce2f..56295ab3d 100644 --- a/third_party/third_party.mk +++ b/third_party/third_party.mk @@ -17,6 +17,7 @@ o/$(MODE)/third_party: \ o/$(MODE)/third_party/radpajama \ o/$(MODE)/third_party/hiredis \ o/$(MODE)/third_party/libcxx \ + o/$(MODE)/third_party/tbb \ o/$(MODE)/third_party/linenoise \ o/$(MODE)/third_party/lua \ o/$(MODE)/third_party/lz4cli \
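
Notes on the added sources, with small illustrative C++ sketches.

The pending_delta_base constant in thread_request_serializer.h biases a signed running delta so it can be stored in an unsigned atomic. The sketch below shows only that encoding idea; the names (delta_base, biased_delta, add_delta, take_pending) and the exchange-based drain are illustrative and are not the library's implementation.

#include <atomic>
#include <cstdint>

// Offset-encode a signed delta in an unsigned atomic (illustrative only).
constexpr std::uint64_t delta_base = 1 << 15;
std::atomic<std::uint64_t> biased_delta{delta_base};

void add_delta(int delta) {
    // Unsigned wraparound makes this behave like signed addition.
    biased_delta.fetch_add(static_cast<std::uint64_t>(delta), std::memory_order_relaxed);
}

int take_pending() {
    // Swap back to the neutral base and recover the accumulated signed value.
    std::uint64_t raw = biased_delta.exchange(delta_base, std::memory_order_relaxed);
    return static_cast<int>(static_cast<std::int64_t>(raw) - static_cast<std::int64_t>(delta_base));
}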
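
The soft/hard worker-limit arithmetic in threading_control_impl::calculate_workers_limits() and calc_workers_soft_limit() is easier to follow outside the class. worker_limits below is a hypothetical standalone restatement: the machine concurrency and the active max_allowed_parallelism value are passed in as parameters instead of being read through governor and global_control.

#include <algorithm>
#include <utility>

// P is the default concurrency, app_limit the active max_allowed_parallelism (0 if unset).
std::pair<unsigned, unsigned> worker_limits(unsigned P, unsigned app_limit) {
    unsigned factor = P <= 128 ? 4 : 2;                     // 4P for small machines, 2P for large ones
    unsigned hard = std::max(std::max(factor * P, 256u), app_limit);
    unsigned soft = app_limit != 0 ? app_limit - 1 : P - 1; // soft limit counts workers, excluding the external thread
    if (soft >= hard) soft = hard - 1;
    return {soft, hard};
}
// e.g. worker_limits(16, 0) -> {15, 256}; worker_limits(16, 8) -> {7, 256}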
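
tick_count.h is the public timing helper; typical usage, assuming the vendored include path used throughout this patch:

#include <cstdio>
#include "third_party/tbb/tick_count.h"

int main() {
    tbb::tick_count t0 = tbb::tick_count::now();
    // ... code being timed ...
    tbb::tick_count t1 = tbb::tick_count::now();
    std::printf("elapsed: %.6f s (clock resolution: %g s)\n",
                (t1 - t0).seconds(), tbb::tick_count::resolution());
}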
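
basic_tls in tls.h hides pthread keys (POSIX) and TLS/FLS slots (Windows) behind one create/destroy/set/get surface. A minimal internal-style sketch; tls_example and the stored pointer are hypothetical, not library code.

#include "third_party/tbb/tls.h"

// Hypothetical per-thread slot; real r1 code owns such slots statically.
static tbb::detail::r1::basic_tls<void*> g_slot;

void tls_example(void* per_thread_value) {
    g_slot.create();               // allocate the key; returns 0 on success
    g_slot.set(per_thread_value);  // visible only to the calling thread
    void* seen = g_slot.get();     // reads this thread's value back
    (void)seen;
    g_slot.destroy();              // release the key once no thread needs it
}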
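
version.h separates the compile-time macros (TBB_VERSION_STRING, TBB_INTERFACE_VERSION) from the runtime queries implemented in version.cpp (TBB_runtime_version, TBB_runtime_interface_version); comparing the two is the usual way to detect a header/runtime mismatch.

#include <cstdio>
#include "third_party/tbb/version.h"

int main() {
    std::printf("headers: %s (interface %d)\n", TBB_VERSION_STRING, TBB_INTERFACE_VERSION);
    std::printf("runtime: %s (interface %d)\n",
                TBB_runtime_version(), TBB_runtime_interface_version());
    if (TBB_runtime_interface_version() < TBB_INTERFACE_VERSION) {
        std::printf("warning: the oneTBB runtime is older than the headers\n");
    }
    return 0;
}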