diff --git a/test/libc/str/longsort_test.c b/test/libc/str/longsort_test.c index cf63f780d..ae672711f 100644 --- a/test/libc/str/longsort_test.c +++ b/test/libc/str/longsort_test.c @@ -16,6 +16,7 @@ │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ │ PERFORMANCE OF THIS SOFTWARE. │ ╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/macros.internal.h" #include "libc/mem/alg.h" #include "libc/mem/gc.internal.h" #include "libc/mem/mem.h" @@ -25,8 +26,24 @@ #include "libc/str/str.h" #include "libc/testlib/ezbench.h" #include "libc/testlib/testlib.h" +#include "third_party/libcxx/learned_sort.h" +#include "third_party/libcxx/pdqsort.h" +#include "third_party/libcxx/ska_sort.h" #include "third_party/vqsort/vqsort.h" +void InsertionSort(long *A, long n) { + long i, j, t; + for (i = 1; i < n; i++) { + t = A[i]; + j = i - 1; + while (j >= 0 && A[j] > t) { + A[j + 1] = A[j]; + j = j - 1; + } + A[j + 1] = t; + } +} + int CompareLong(const void *a, const void *b) { const long *x = a; const long *y = b; @@ -110,25 +127,87 @@ BENCH(_longsort, bench) { long *p1 = gc(malloc(n * sizeof(long))); long *p2 = gc(malloc(n * sizeof(long))); rngset(p1, n * sizeof(long), 0, 0); - EZBENCH2("_longsort", memcpy(p2, p1, n * sizeof(long)), _longsort(p2, n)); + EZBENCH2("_longsort rand", memcpy(p2, p1, n * sizeof(long)), + _longsort(p2, n)); if (X86_HAVE(AVX2)) { - EZBENCH2("vqsort_int64_avx2", memcpy(p2, p1, n * sizeof(long)), + EZBENCH2("vq_int64_avx2 rand", memcpy(p2, p1, n * sizeof(long)), vqsort_int64_avx2(p2, n)); } if (X86_HAVE(SSE4_2)) { - EZBENCH2("vqsort_int64_sse4", memcpy(p2, p1, n * sizeof(long)), + EZBENCH2("vq_int64_sse4 rand", memcpy(p2, p1, n * sizeof(long)), vqsort_int64_sse4(p2, n)); } if (X86_HAVE(SSSE3)) { - EZBENCH2("vqsort_int64_ssse3", memcpy(p2, p1, n * sizeof(long)), + EZBENCH2("vq_int64_ssse3 rand", memcpy(p2, p1, n * sizeof(long)), vqsort_int64_ssse3(p2, n)); } - EZBENCH2("vqsort_int64_sse2", memcpy(p2, p1, n * sizeof(long)), + EZBENCH2("vq_int64_sse2 rand", memcpy(p2, p1, n * sizeof(long)), vqsort_int64_sse2(p2, n)); - EZBENCH2("radix_sort_int64", memcpy(p2, p1, n * sizeof(long)), + EZBENCH2("radix_int64 rand", memcpy(p2, p1, n * sizeof(long)), radix_sort_int64(p2, n)); - EZBENCH2("qsort(long)", memcpy(p2, p1, n * sizeof(long)), + EZBENCH2("ska_sort_int64 rand", memcpy(p2, p1, n * sizeof(long)), + ska_sort_int64(p2, n)); + EZBENCH2("pdq_int64 rand", memcpy(p2, p1, n * sizeof(long)), + pdqsort_int64(p2, n)); + EZBENCH2("pdq_branchless rand", memcpy(p2, p1, n * sizeof(long)), + pdqsort_branchless_int64(p2, n)); + EZBENCH2("learned_int64 rand", memcpy(p2, p1, n * sizeof(long)), + learned_sort_int64(p2, n)); + EZBENCH2("qsort(long) rand", memcpy(p2, p1, n * sizeof(long)), qsort(p2, n, sizeof(long), CompareLong)); + EZBENCH2("InsertionSort rand", memcpy(p2, p1, n * sizeof(long)), + InsertionSort(p2, n)); +} + +BENCH(_longsort, benchNearlySorted) { + printf("\n"); + long i, j, k, t; + long n = 5000; // total items + long m = 500; // misplaced items + long d = 5; // maximum drift + long *p1 = gc(malloc(n * sizeof(long))); + long *p2 = gc(malloc(n * sizeof(long))); + for (i = 0; i < n; ++i) { + p1[i] = i; + } + for (i = 0; i < m; ++i) { + j = rand() % n; + k = rand() % (d * 2) - d + j; + k = MIN(MAX(0, k), n - 1); + t = p1[j]; + p1[j] = p1[k]; + p1[k] = t; + } + EZBENCH2("_longsort near", memcpy(p2, p1, n * sizeof(long)), + _longsort(p2, n)); + if (X86_HAVE(AVX2)) { + EZBENCH2("vq_int64_avx2 near", memcpy(p2, p1, n * sizeof(long)), + 
vqsort_int64_avx2(p2, n)); + } + if (X86_HAVE(SSE4_2)) { + EZBENCH2("vq_int64_sse4 near", memcpy(p2, p1, n * sizeof(long)), + vqsort_int64_sse4(p2, n)); + } + if (X86_HAVE(SSSE3)) { + EZBENCH2("vq_int64_ssse3 near", memcpy(p2, p1, n * sizeof(long)), + vqsort_int64_ssse3(p2, n)); + } + EZBENCH2("vq_int64_sse2 near", memcpy(p2, p1, n * sizeof(long)), + vqsort_int64_sse2(p2, n)); + EZBENCH2("radix_int64 near", memcpy(p2, p1, n * sizeof(long)), + radix_sort_int64(p2, n)); + EZBENCH2("ska_sort_int64 near", memcpy(p2, p1, n * sizeof(long)), + ska_sort_int64(p2, n)); + EZBENCH2("pdq_int64 near", memcpy(p2, p1, n * sizeof(long)), + pdqsort_int64(p2, n)); + EZBENCH2("pdq_branchless near", memcpy(p2, p1, n * sizeof(long)), + pdqsort_branchless_int64(p2, n)); + EZBENCH2("learned_int64 near", memcpy(p2, p1, n * sizeof(long)), + learned_sort_int64(p2, n)); + EZBENCH2("qsort(long) near", memcpy(p2, p1, n * sizeof(long)), + qsort(p2, n, sizeof(long), CompareLong)); + EZBENCH2("InsertionSort near", memcpy(p2, p1, n * sizeof(long)), + InsertionSort(p2, n)); } int CompareInt(const void *a, const void *b) { @@ -215,6 +294,7 @@ BENCH(_intsort, bench) { int *p2 = gc(malloc(n * sizeof(int))); rngset(p1, n * sizeof(int), 0, 0); EZBENCH2("_intsort", memcpy(p2, p1, n * sizeof(int)), _intsort(p2, n)); + EZBENCH2("djbsort", memcpy(p2, p1, n * sizeof(int)), djbsort(p2, n)); if (X86_HAVE(AVX2)) { EZBENCH2("vqsort_int32_avx2", memcpy(p2, p1, n * sizeof(int)), vqsort_int32_avx2(p2, n)); @@ -229,7 +309,6 @@ BENCH(_intsort, bench) { } EZBENCH2("vqsort_int32_sse2", memcpy(p2, p1, n * sizeof(int)), vqsort_int32_sse2(p2, n)); - EZBENCH2("djbsort", memcpy(p2, p1, n * sizeof(int)), djbsort(p2, n)); EZBENCH2("radix_sort_int32", memcpy(p2, p1, n * sizeof(int)), radix_sort_int32(p2, n)); EZBENCH2("qsort(int)", memcpy(p2, p1, n * sizeof(int)), diff --git a/third_party/ggml/llama_util.h b/third_party/ggml/llama_util.h index 05184945d..2f7dbe0b2 100755 --- a/third_party/ggml/llama_util.h +++ b/third_party/ggml/llama_util.h @@ -1,6 +1,6 @@ +// -*- c++ -*- // Internal header to be included only by llama.cpp. // Contains wrappers around OS interfaces. - #ifndef LLAMA_UTIL_H #define LLAMA_UTIL_H #include "libc/calls/struct/rlimit.h" diff --git a/third_party/libcxx/learned_sort.cc b/third_party/libcxx/learned_sort.cc new file mode 100644 index 000000000..dbe198f8a --- /dev/null +++ b/third_party/libcxx/learned_sort.cc @@ -0,0 +1,25 @@ +/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│ +│vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2023 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. 
│ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "third_party/libcxx/learned_sort.h" + +void learned_sort_int64(int64_t* A, size_t n) { + std::vector v(A, A + n); + learned_sort::sort(v.begin(), v.end()); + std::copy(v.begin(), v.end(), A); +} diff --git a/third_party/libcxx/learned_sort.h b/third_party/libcxx/learned_sort.h new file mode 100644 index 000000000..0c3a28e73 --- /dev/null +++ b/third_party/libcxx/learned_sort.h @@ -0,0 +1,1100 @@ +// -*- c++ -*- +#ifndef COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_H_ +#define COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_H_ +#ifdef __cplusplus +#include "third_party/libcxx/algorithm" +#include "third_party/libcxx/cmath" +#include "third_party/libcxx/iterator" +#include "third_party/libcxx/vector" +#include "third_party/libcxx/rmi.h" +#include "third_party/libcxx/utils.h" +// clang-format off + +/** + * @file learned_sort.h + * @author Ani Kristo, Kapil Vaidya + * @brief The purpose of this file is to provide an implementation + * of Learned Sort, a model-enhanced sorting algorithm. + * + * @copyright Copyright (c) 2021 Ani Kristo + * @copyright Copyright (C) 2021 Kapil Vaidya + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 3 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +using namespace std; +using namespace learned_sort; + +namespace learned_sort { + +// Parameters +static constexpr int PRIMARY_FANOUT = 1000; +static constexpr int SECONDARY_FANOUT = 100; +static constexpr int PRIMARY_FRAGMENT_CAPACITY = 100; +static constexpr int SECONDARY_FRAGMENT_CAPACITY = 100; +static constexpr int REP_CNT_THRESHOLD = 5; + +// static struct timespec SubtractTime(struct timespec a, struct timespec b) { +// a.tv_sec -= b.tv_sec; +// if (a.tv_nsec < b.tv_nsec) { +// a.tv_nsec += 1000000000; +// a.tv_sec--; +// } +// a.tv_nsec -= b.tv_nsec; +// return a; +// } + +// static time_t ToNanoseconds(struct timespec ts) { +// return ts.tv_sec * 1000000000 + ts.tv_nsec; +// } + +template +void sort(RandomIt begin, RandomIt end, + TwoLayerRMI::value_type> &rmi) { + //----------------------------------------------------------// + // INIT // + //----------------------------------------------------------// + + // Determine the data type + typedef typename iterator_traits::value_type T; + + // Constants + const long input_sz = std::distance(begin, end); + const long TRAINING_SAMPLE_SZ = rmi.training_sample.size(); + + // Keeps track of the number of elements in each bucket + long primary_bucket_sizes[PRIMARY_FANOUT]{0}; + + // Counts the number of elements that are done going through the + // partitioning steps for good + long num_elms_finalized = 0; + + // Cache the model parameters + const long num_leaf_models = rmi.hp.num_leaf_models; + double root_slope = rmi.root_model.slope; + double root_intercept = rmi.root_model.intercept; + auto num_models = rmi.hp.num_leaf_models; + double slopes[num_leaf_models]; + double intercepts[num_leaf_models]; + for (auto i = 0; i < num_leaf_models; ++i) { + slopes[i] = rmi.leaf_models[i].slope; + intercepts[i] = 
rmi.leaf_models[i].intercept; + } + + // struct timespec ts1, ts2; + + //----------------------------------------------------------// + // PARTITION THE KEYS INTO BUCKETS // + //----------------------------------------------------------// + + // clock_gettime(CLOCK_REALTIME, &ts1); + { + // Keeps track of the number of elements in each fragment + long fragment_sizes[PRIMARY_FANOUT]{0}; + + // An auxiliary set of fragments where the elements will be partitioned + auto fragments = new T[PRIMARY_FANOUT][PRIMARY_FRAGMENT_CAPACITY]; + + // Keeps track of the number of fragments that have been written back to the + // original array + long fragments_written = 0; + + // Points to the next free space where to write back + auto write_itr = begin; + + // For each element in the input, predict which bucket it would go to, and + // insert to the respective bucket fragment + for (auto it = begin; it != end; ++it) { + // Predict the model id in the leaf layer of the RMI + long pred_bucket_idx = static_cast(std::max( + 0., + std::min(num_leaf_models - 1., root_slope * it[0] + root_intercept))); + + // Predict the CDF + double pred_cdf = + slopes[pred_bucket_idx] * it[0] + intercepts[pred_bucket_idx]; + + // Get the predicted bucket id + pred_bucket_idx = static_cast(std::max( + 0., std::min(PRIMARY_FANOUT - 1., pred_cdf * PRIMARY_FANOUT))); + + // Place the current element in the predicted fragment + fragments[pred_bucket_idx][fragment_sizes[pred_bucket_idx]] = it[0]; + + // Update the fragment size and the bucket size + primary_bucket_sizes[pred_bucket_idx]++; + fragment_sizes[pred_bucket_idx]++; + + if (fragment_sizes[pred_bucket_idx] == PRIMARY_FRAGMENT_CAPACITY) { + fragments_written++; + // The predicted fragment is full, place in the array and update bucket + // size + std::move(fragments[pred_bucket_idx], + fragments[pred_bucket_idx] + PRIMARY_FRAGMENT_CAPACITY, + write_itr); + write_itr += PRIMARY_FRAGMENT_CAPACITY; + + // Reset the fragment size + fragment_sizes[pred_bucket_idx] = 0; + } + } + + //----------------------------------------------------------// + // DEFRAGMENTATION // + //----------------------------------------------------------// + + // Records the ending offset for the buckets + long bucket_end_offset[PRIMARY_FANOUT]{0}; + bucket_end_offset[0] = primary_bucket_sizes[0]; + + // Swap space + T *swap_buffer = new T[PRIMARY_FRAGMENT_CAPACITY]; + + // Maintains a writing iterator for each bucket, initialized at the starting + // offsets + long bucket_write_off[PRIMARY_FANOUT]{0}; + bucket_write_off[0] = 0; + + // Calculate the starting and ending offsets of each bucket + for (long bucket_idx = 1; bucket_idx < PRIMARY_FANOUT; ++bucket_idx) { + // Calculate the bucket end offsets (prefix sum) + bucket_end_offset[bucket_idx] = + primary_bucket_sizes[bucket_idx] + bucket_end_offset[bucket_idx - 1]; + + // Calculate the bucket start offsets and assign the writing iterator to + // that value + + // These offsets are aligned w.r.t. PRIMARY_FRAGMENT_CAPACITY + bucket_write_off[bucket_idx] = ceil(bucket_end_offset[bucket_idx - 1] * + 1. / PRIMARY_FRAGMENT_CAPACITY) * + PRIMARY_FRAGMENT_CAPACITY; + + // Fence the bucket iterator. 
This might occur because the write offset is + // not necessarily aligned with PRIMARY_FRAGMENT_CAPACITY + if (bucket_write_off[bucket_idx] > bucket_end_offset[bucket_idx]) { + bucket_write_off[bucket_idx] = bucket_end_offset[bucket_idx]; + } + } + + // This keeps track of which bucket we are operating on + long stored_bucket_idx = 0; + + // Go over each fragment and re-arrange them so that they are placed + // contiguously within each bucket boundaries + for (long fragment_idx = 0; fragment_idx < fragments_written; + ++fragment_idx) { + auto cur_fragment_start_off = fragment_idx * PRIMARY_FRAGMENT_CAPACITY; + + // Find the bucket where this fragment is stored into, which is not + // necessarily the bucket it belongs to + while (cur_fragment_start_off >= bucket_end_offset[stored_bucket_idx]) { + ++stored_bucket_idx; + } + + // Skip this fragment if its starting index is lower than the writing + // offset for the current bucket. This means that the fragment has already + // been dealt with + if (cur_fragment_start_off < bucket_write_off[stored_bucket_idx]) { + continue; + } + + // Find out what bucket the current fragment belongs to by looking at RMI + // prediction for the first element of the fragment. + auto first_elm_in_fragment = begin[cur_fragment_start_off]; + long pred_bucket_for_cur_fragment = static_cast(std::max( + 0., std::min(num_leaf_models - 1., + root_slope * first_elm_in_fragment + root_intercept))); + + // Predict the CDF + double pred_cdf = + slopes[pred_bucket_for_cur_fragment] * first_elm_in_fragment + + intercepts[pred_bucket_for_cur_fragment]; + + // Get the predicted bucket id + pred_bucket_for_cur_fragment = static_cast(std::max( + 0., std::min(PRIMARY_FANOUT - 1., pred_cdf * PRIMARY_FANOUT))); + + // If the current bucket contains fragments that are not all the way full, + // no need to use a swap buffer, since there is available space. The first + // condition checks whether the current fragment will need to be written + // in a bucket that was already consumed. The second condition checks + // whether the writing offset of the predicted bucket is beyond the last + // element written into the array. This means that there is empty space to + // write the fragments to. 
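+      // For instance (illustrative numbers only): with a fragment capacity of
+      // 100, if the partitioning pass flushed only two full fragments
+      // (elements 0..199) back into the array and the predicted bucket's
+      // write offset is 300, that offset lies beyond everything that was
+      // flushed, so the fragment below can simply be copied into place
+      // without evicting an incumbent fragment.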
+ if (bucket_write_off[pred_bucket_for_cur_fragment] < + cur_fragment_start_off || + bucket_write_off[pred_bucket_for_cur_fragment] >= + fragments_written * PRIMARY_FRAGMENT_CAPACITY) { + // If the current fragment will not be the last one to write in the + // predicted bucket + if (bucket_write_off[pred_bucket_for_cur_fragment] + + PRIMARY_FRAGMENT_CAPACITY <= + bucket_end_offset[pred_bucket_for_cur_fragment]) { + auto write_itr = + begin + bucket_write_off[pred_bucket_for_cur_fragment]; + auto read_itr = begin + cur_fragment_start_off; + + // Move the elements of the fragment to the bucket's write offset + std::copy(read_itr, read_itr + PRIMARY_FRAGMENT_CAPACITY, write_itr); + + // Update the bucket write offset + bucket_write_off[pred_bucket_for_cur_fragment] += + PRIMARY_FRAGMENT_CAPACITY; + + } else { // This is the last fragment to write into the predicted + // bucket + auto write_itr = + begin + bucket_write_off[pred_bucket_for_cur_fragment]; + auto read_itr = begin + cur_fragment_start_off; + + // Calculate the fragment size + auto cur_fragment_sz = + bucket_end_offset[pred_bucket_for_cur_fragment] - + bucket_write_off[pred_bucket_for_cur_fragment]; + + // Move the elements of the fragment to the bucket's write offset + std::copy(read_itr, read_itr + cur_fragment_sz, write_itr); + + // Update the bucket write offset for the predicted bucket + bucket_write_off[pred_bucket_for_cur_fragment] = + bucket_end_offset[pred_bucket_for_cur_fragment]; + + // Place the remaining elements of this fragment into the auxiliary + // fragment memory. This is needed when the start of the bucket might + // have empty spaces because the writing iterator was not aligned with + // FRAGMENT_CAPACITY (not a multiple) + for (int elm_idx = cur_fragment_sz; + elm_idx < PRIMARY_FRAGMENT_CAPACITY; elm_idx++) { + fragments[pred_bucket_for_cur_fragment] + [fragment_sizes[pred_bucket_for_cur_fragment] + elm_idx - + cur_fragment_sz] = read_itr[elm_idx]; + } + + // Update the auxiliary fragment size + fragment_sizes[pred_bucket_for_cur_fragment] += + PRIMARY_FRAGMENT_CAPACITY - cur_fragment_sz; + } + + } else { // The current fragment is to be written in a non-empty space, + // so an incumbent fragment will need to be evicted + + // If the fragment is already within the correct bucket + if (pred_bucket_for_cur_fragment == stored_bucket_idx) { + // If the empty area left in the bucket is not enough for all the + // elements in the fragment. 
This is needed when the start of the + // bucket might have empty spaces because the writing iterator was not + // aligned with FRAGMENT_CAPACITY (not a multiple) + if (bucket_end_offset[stored_bucket_idx] - + bucket_write_off[stored_bucket_idx] < + PRIMARY_FRAGMENT_CAPACITY) { + auto write_itr = begin + bucket_write_off[stored_bucket_idx]; + auto read_itr = begin + cur_fragment_start_off; + + // Calculate the amount of space left for the current bucket + long remaining_space = bucket_end_offset[stored_bucket_idx] - + bucket_write_off[stored_bucket_idx]; + + // Write out the fragment in the remaining space + std::copy(read_itr, read_itr + remaining_space, write_itr); + + // Update the read iterator to point to the remaining elements + read_itr = begin + cur_fragment_start_off + remaining_space; + + // Calculate the fragment size + auto cur_fragment_sz = fragment_sizes[stored_bucket_idx]; + + // Write the remaining elements into the auxiliary fragment space + for (int k = 0; k < PRIMARY_FRAGMENT_CAPACITY - remaining_space; + ++k) { + fragments[stored_bucket_idx][cur_fragment_sz++] = *(read_itr++); + } + + // Update the fragment size after the new placements + fragment_sizes[stored_bucket_idx] = cur_fragment_sz; + + // Update the bucket write offset to indicate that the bucket was + // fully written + bucket_write_off[stored_bucket_idx] = + bucket_end_offset[stored_bucket_idx]; + + } else { // The bucket has enough space for the incoming fragment + + auto write_itr = begin + bucket_write_off[stored_bucket_idx]; + auto read_itr = begin + cur_fragment_start_off; + + if (write_itr != read_itr) { + // Write the elements to the bucket's write offset + std::copy(read_itr, read_itr + PRIMARY_FRAGMENT_CAPACITY, + write_itr); + } + + // Update the write offset for the current bucket + bucket_write_off[stored_bucket_idx] += PRIMARY_FRAGMENT_CAPACITY; + } + } else { // The fragment is not in the correct bucket and needs to be + // swapped out with an incorrectly placed fragment there + + // Predict the bucket of the fragment that will be swapped out + auto first_elm_in_fragment_to_be_swapped_out = + begin[bucket_write_off[pred_bucket_for_cur_fragment]]; + + long pred_bucket_for_fragment_to_be_swapped_out = + static_cast(std::max( + 0., std::min( + num_leaf_models - 1., + root_slope * first_elm_in_fragment_to_be_swapped_out + + root_intercept))); + + // Predict the CDF + double pred_cdf = + slopes[pred_bucket_for_fragment_to_be_swapped_out] * + first_elm_in_fragment_to_be_swapped_out + + intercepts[pred_bucket_for_fragment_to_be_swapped_out]; + + // Get the predicted bucket idx + pred_bucket_for_fragment_to_be_swapped_out = static_cast( + std::max(0., std::min(PRIMARY_FANOUT - 1., + pred_cdf * PRIMARY_FANOUT))); + + // If the fragment at the next write offset is not already in the + // right bucket, swap the fragments + if (pred_bucket_for_fragment_to_be_swapped_out != + pred_bucket_for_cur_fragment) { + // Move the contents at the write offset into a swap buffer + auto itr_buf1 = + begin + bucket_write_off[pred_bucket_for_cur_fragment]; + auto itr_buf2 = begin + cur_fragment_start_off; + std::copy(itr_buf1, itr_buf1 + PRIMARY_FRAGMENT_CAPACITY, + swap_buffer); + + // Write the contents of the incoming fragment + std::copy(itr_buf2, itr_buf2 + PRIMARY_FRAGMENT_CAPACITY, itr_buf1); + + // Place the swap buffer into the emptied space + std::copy(swap_buffer, swap_buffer + PRIMARY_FRAGMENT_CAPACITY, + itr_buf2); + + pred_bucket_for_cur_fragment = + pred_bucket_for_fragment_to_be_swapped_out; + } else { // 
The fragment at the write offset is already in the right + // bucket + bucket_write_off[pred_bucket_for_cur_fragment] += + PRIMARY_FRAGMENT_CAPACITY; + } + + // Decrement the fragment index so that the newly swapped in fragment + // is not skipped over + --fragment_idx; + } + } + } + + // Add the elements remaining in the auxiliary fragments to the buckets they + // belong to. This is for when the fragments weren't full and thus not + // flushed to the input array + for (long bucket_idx = 0; bucket_idx < PRIMARY_FANOUT; ++bucket_idx) { + // Set the writing offset to the beggining of the bucket + long write_off = bucket_end_offset[bucket_idx - 1]; + if (bucket_idx == 0) { + write_off = 0; + } + + // Add the elements left in the auxiliary fragments to the beggining of + // the predicted bucket, since it was not full + long elm_idx; + for (elm_idx = 0; (elm_idx < fragment_sizes[bucket_idx]) && + (write_off % PRIMARY_FRAGMENT_CAPACITY != 0); + ++elm_idx) { + begin[write_off++] = fragments[bucket_idx][elm_idx]; + } + + // Add the remaining elements from the auxiliary fragments to the end of + // the bucket it belongs to + write_off = bucket_end_offset[bucket_idx] - fragment_sizes[bucket_idx]; + for (; elm_idx < fragment_sizes[bucket_idx]; ++elm_idx) { + begin[write_off + elm_idx] = fragments[bucket_idx][elm_idx]; + ++bucket_write_off[bucket_idx]; + } + } + + // Cleanup + delete[] swap_buffer; + delete[] fragments; + } + // clock_gettime(CLOCK_REALTIME, &ts2); + // printf("PARTITION THE KEYS INTO BUCKETS %ld us\n", + // ToNanoseconds(SubtractTime(ts2, ts1)) / 1000); + + //----------------------------------------------------------// + // SECOND ROUND OF PARTITIONING // + //----------------------------------------------------------// + + // clock_gettime(CLOCK_REALTIME, &ts1); + { + // Iterate over each bucket starting from the end so that the merging step + // later is done in-place + auto primary_bucket_start = begin; + auto primary_bucket_end = begin; + for (long primary_bucket_idx = 0; primary_bucket_idx < PRIMARY_FANOUT; + ++primary_bucket_idx) { + auto primary_bucket_sz = primary_bucket_sizes[primary_bucket_idx]; + + // Skip bucket if empty + if (primary_bucket_sz == 0) continue; + + // Update the start and the end of the bucket data + primary_bucket_start = primary_bucket_end; + primary_bucket_end += primary_bucket_sz; + + // Check for homogeneity + bool is_homogeneous = true; + if (rmi.enable_dups_detection) { + for (long elm_idx = 1; elm_idx < primary_bucket_sz; ++elm_idx) { + if (primary_bucket_start[elm_idx] != + primary_bucket_start[elm_idx - 1]) { + is_homogeneous = false; + break; + } + } + } + + // When the bucket is homogeneous, skip sorting it + if (rmi.enable_dups_detection && is_homogeneous) { + num_elms_finalized += primary_bucket_sz; + + } + + // When the bucket is not homogeneous, and it's not flagged for duplicates + else { + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -// + // PARTITION THE KEYS INTO SECONDARY BUCKETS // + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -// + + // Keeps track of the number of elements in each secondary bucket + long secondary_bucket_sizes[SECONDARY_FANOUT]{0}; + + // Keeps track of the number of elements in each fragment + long fragment_sizes[SECONDARY_FANOUT]{0}; + + // An auxiliary set of fragments where the elements will be partitioned + auto fragments = new T[SECONDARY_FANOUT][SECONDARY_FRAGMENT_CAPACITY]; + + // Keeps track of the number of fragments that have been written back to + // the original array + long 
fragments_written = 0; + + // Points to the next free space where to write back + auto write_itr = primary_bucket_start; + + // For each element in the input, predict which bucket it would go to, + // and insert to the respective bucket fragment + for (auto it = primary_bucket_start; it != primary_bucket_end; ++it) { + // Predict the model id in the leaf layer of the RMI + long pred_bucket_idx = static_cast( + std::max(0., std::min(num_leaf_models - 1., + root_slope * it[0] + root_intercept))); + + // Predict the CDF + double pred_cdf = + slopes[pred_bucket_idx] * it[0] + intercepts[pred_bucket_idx]; + + // Get the predicted bucket id + pred_bucket_idx = static_cast(std::max( + 0., std::min(SECONDARY_FANOUT - 1., + (pred_cdf * PRIMARY_FANOUT - primary_bucket_idx) * + SECONDARY_FANOUT))); + + // Place the current element in the predicted fragment + fragments[pred_bucket_idx][fragment_sizes[pred_bucket_idx]] = it[0]; + + // Update the fragment size and the bucket size + ++secondary_bucket_sizes[pred_bucket_idx]; + ++fragment_sizes[pred_bucket_idx]; + + if (fragment_sizes[pred_bucket_idx] == SECONDARY_FRAGMENT_CAPACITY) { + ++fragments_written; + // The predicted fragment is full, place in the array and update + // bucket size + std::move(fragments[pred_bucket_idx], + fragments[pred_bucket_idx] + SECONDARY_FRAGMENT_CAPACITY, + write_itr); + write_itr += SECONDARY_FRAGMENT_CAPACITY; + + // Reset the fragment size + fragment_sizes[pred_bucket_idx] = 0; + } + } // end of partitioninig over secondary fragments + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -// + // DEFRAGMENTATION // + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -// + + // Records the ending offset for the buckets + long bucket_end_offset[SECONDARY_FANOUT]{0}; + bucket_end_offset[0] = secondary_bucket_sizes[0]; + + // Swap space + T *swap_buffer = new T[SECONDARY_FRAGMENT_CAPACITY]; + + // Maintains a writing iterator for each bucket, initialized at the + // starting offsets + long bucket_start_off[SECONDARY_FANOUT]{0}; + bucket_start_off[0] = 0; + + // Calculate the starting and ending offsets of each bucket + for (long bucket_idx = 1; bucket_idx < SECONDARY_FANOUT; ++bucket_idx) { + // Calculate the bucket end offsets (prefix sum) + bucket_end_offset[bucket_idx] = secondary_bucket_sizes[bucket_idx] + + bucket_end_offset[bucket_idx - 1]; + + // Calculate the bucket start offsets and assign the writing + // iterator to that value + + // These offsets are aligned w.r.t. SECONDARY_FRAGMENT_CAPACITY + bucket_start_off[bucket_idx] = + ceil(bucket_end_offset[bucket_idx - 1] * 1. / + SECONDARY_FRAGMENT_CAPACITY) * + SECONDARY_FRAGMENT_CAPACITY; + + // Fence the bucket iterator. 
This might occur because the write + // offset is not necessarily aligned with SECONDARY_FRAGMENT_CAPACITY + if (bucket_start_off[bucket_idx] > bucket_end_offset[bucket_idx]) { + bucket_start_off[bucket_idx] = bucket_end_offset[bucket_idx]; + } + } + + // This keeps track of which bucket we are operating on + long stored_bucket_idx = 0; + + // Go over each fragment and re-arrange them so that they are placed + // contiguously within each bucket boundaries + for (long fragment_idx = 0; fragment_idx < fragments_written; + ++fragment_idx) { + auto cur_fragment_start_off = + fragment_idx * SECONDARY_FRAGMENT_CAPACITY; + + // Find the bucket where this fragment is stored into, which is not + // necessarily the bucket it belongs to + while (cur_fragment_start_off >= + bucket_end_offset[stored_bucket_idx]) { + ++stored_bucket_idx; + } + + // Skip this fragment if its starting index is lower than the writing + // offset for the current bucket. This means that the fragment has + // already been dealt with + if (cur_fragment_start_off < bucket_start_off[stored_bucket_idx]) { + continue; + } + + // Find out what bucket the current fragment belongs to by looking at + // RMI prediction for the first element of the fragment. + auto first_elm_in_fragment = + primary_bucket_start[cur_fragment_start_off]; + long pred_bucket_for_cur_fragment = static_cast(std::max( + 0., + std::min(num_leaf_models - 1., + root_slope * first_elm_in_fragment + root_intercept))); + + // Predict the CDF + double pred_cdf = + slopes[pred_bucket_for_cur_fragment] * first_elm_in_fragment + + intercepts[pred_bucket_for_cur_fragment]; + + // Get the predicted bucket id + pred_bucket_for_cur_fragment = static_cast(std::max( + 0., std::min(SECONDARY_FANOUT - 1., + (pred_cdf * PRIMARY_FANOUT - primary_bucket_idx) * + SECONDARY_FANOUT))); + + // If the current bucket contains fragments that are not all the way + // full, no need to use a swap fragment, since there is available + // space. The first condition checks whether the current fragment will + // need to be written in a bucket that was already consumed. The + // second condition checks whether the writing offset of the + // predicted bucket is beyond the last element written into the + // array. This means that there is empty space to write the fragments + // to. 
+ if (bucket_start_off[pred_bucket_for_cur_fragment] < + cur_fragment_start_off || + bucket_start_off[pred_bucket_for_cur_fragment] >= + fragments_written * SECONDARY_FRAGMENT_CAPACITY) { + // If the current fragment will not be the last one to write in the + // predicted bucket + if (bucket_start_off[pred_bucket_for_cur_fragment] + + SECONDARY_FRAGMENT_CAPACITY <= + bucket_end_offset[pred_bucket_for_cur_fragment]) { + auto write_itr = primary_bucket_start + + bucket_start_off[pred_bucket_for_cur_fragment]; + auto read_itr = primary_bucket_start + cur_fragment_start_off; + + // Move the elements of the fragment to the bucket's write offset + std::copy(read_itr, read_itr + SECONDARY_FRAGMENT_CAPACITY, + write_itr); + + // Update the bucket write offset + bucket_start_off[pred_bucket_for_cur_fragment] += + SECONDARY_FRAGMENT_CAPACITY; + + } else { // This is the last fragment to write into the predicted + // bucket + auto write_itr = primary_bucket_start + + bucket_start_off[pred_bucket_for_cur_fragment]; + auto read_itr = primary_bucket_start + cur_fragment_start_off; + + // Calculate the fragment size + auto cur_fragment_sz = + bucket_end_offset[pred_bucket_for_cur_fragment] - + bucket_start_off[pred_bucket_for_cur_fragment]; + + // Move the elements of the fragment to the bucket's write offset + std::copy(read_itr, read_itr + cur_fragment_sz, write_itr); + + // Update the bucket write offset for the predicted bucket + bucket_start_off[pred_bucket_for_cur_fragment] = + bucket_end_offset[pred_bucket_for_cur_fragment]; + + // Place the remaining elements of this fragment into the + // auxiliary fragment memory. This is needed when the start of the + // bucket might have empty spaces because the writing iterator + // was not aligned with FRAGMENT_CAPACITY (not a multiple) + for (long elm_idx = cur_fragment_sz; + elm_idx < SECONDARY_FRAGMENT_CAPACITY; elm_idx++) { + fragments[pred_bucket_for_cur_fragment] + [fragment_sizes[pred_bucket_for_cur_fragment] + + elm_idx - cur_fragment_sz] = read_itr[elm_idx]; + } + + // Update the auxiliary fragment size + fragment_sizes[pred_bucket_for_cur_fragment] += + SECONDARY_FRAGMENT_CAPACITY - cur_fragment_sz; + } + + } else { // The current fragment is to be written in a non-empty + // space, so an incumbent fragment will need to be evicted + + // If the fragment is already within the correct bucket + if (pred_bucket_for_cur_fragment == stored_bucket_idx) { + // If the empty area left in the bucket is not enough for all + // the elements in the fragment. 
This is needed when the start of + // the bucket might have empty spaces because the writing + // iterator was not aligned with FRAGMENT_CAPACITY (not a + // multiple) + if (bucket_end_offset[stored_bucket_idx] - + bucket_start_off[stored_bucket_idx] < + SECONDARY_FRAGMENT_CAPACITY) { + auto write_itr = + primary_bucket_start + bucket_start_off[stored_bucket_idx]; + auto read_itr = primary_bucket_start + cur_fragment_start_off; + + // Calculate the amount of space left for the current bucket + long remaining_space = bucket_end_offset[stored_bucket_idx] - + bucket_start_off[stored_bucket_idx]; + + // Write out the fragment in the remaining space + std::copy(read_itr, read_itr + remaining_space, write_itr); + + // Update the read iterator to point to the remaining elements + read_itr = primary_bucket_start + cur_fragment_start_off + + remaining_space; + + // Calculate the fragment size + auto cur_fragment_sz = fragment_sizes[stored_bucket_idx]; + + // Write the remaining elements into the auxiliary fragment + // space + for (int k = 0; + k < SECONDARY_FRAGMENT_CAPACITY - remaining_space; ++k) { + fragments[stored_bucket_idx][cur_fragment_sz++] = + *(read_itr++); + } + + // Update the fragment size after the new placements + fragment_sizes[stored_bucket_idx] = cur_fragment_sz; + + // Update the bucket write offset to indicate that the bucket + // was fully written + bucket_start_off[stored_bucket_idx] = + bucket_end_offset[stored_bucket_idx]; + + } else { // The bucket has enough space for the incoming fragment + + auto write_itr = + primary_bucket_start + bucket_start_off[stored_bucket_idx]; + auto read_itr = primary_bucket_start + cur_fragment_start_off; + + if (write_itr != read_itr) { + // Write the elements to the bucket's write offset + std::copy(read_itr, read_itr + SECONDARY_FRAGMENT_CAPACITY, + write_itr); + } + + // Update the write offset for the current bucket + bucket_start_off[stored_bucket_idx] += + SECONDARY_FRAGMENT_CAPACITY; + } + } else { // The fragment is not in the correct bucket and needs to + // be + // swapped out with an incorrectly placed fragment there + + // Predict the bucket of the fragment that will be swapped out + auto first_elm_in_fragment_to_be_swapped_out = + primary_bucket_start + [bucket_start_off[pred_bucket_for_cur_fragment]]; + + long pred_bucket_for_fragment_to_be_swapped_out = + static_cast(std::max( + 0., + std::min( + num_leaf_models - 1., + root_slope * first_elm_in_fragment_to_be_swapped_out + + root_intercept))); + + // Predict the CDF + double pred_cdf = + slopes[pred_bucket_for_fragment_to_be_swapped_out] * + first_elm_in_fragment_to_be_swapped_out + + intercepts[pred_bucket_for_fragment_to_be_swapped_out]; + + // Get the predicted bucket idx + pred_bucket_for_fragment_to_be_swapped_out = static_cast( + std::max(0., std::min(SECONDARY_FANOUT - 1., + (pred_cdf * PRIMARY_FANOUT - + primary_bucket_idx) * + SECONDARY_FANOUT))); + + // If the fragment at the next write offset is not already in the + // right bucket, swap the fragments + if (pred_bucket_for_fragment_to_be_swapped_out != + pred_bucket_for_cur_fragment) { + // Move the contents at the write offset into a swap fragment + auto itr_buf1 = primary_bucket_start + + bucket_start_off[pred_bucket_for_cur_fragment]; + auto itr_buf2 = primary_bucket_start + cur_fragment_start_off; + std::copy(itr_buf1, itr_buf1 + SECONDARY_FRAGMENT_CAPACITY, + swap_buffer); + + // Write the contents of the incoming fragment + std::copy(itr_buf2, itr_buf2 + SECONDARY_FRAGMENT_CAPACITY, + itr_buf1); + + // Place 
the swap buffer into the emptied space + std::copy(swap_buffer, + swap_buffer + SECONDARY_FRAGMENT_CAPACITY, itr_buf2); + + pred_bucket_for_cur_fragment = + pred_bucket_for_fragment_to_be_swapped_out; + } else { // The fragment at the write offset is already in the + // right bucket + bucket_start_off[pred_bucket_for_cur_fragment] += + SECONDARY_FRAGMENT_CAPACITY; + } + + // Decrement the fragment index so that the newly swapped in + // fragment is not skipped over + --fragment_idx; + } + } + } + + // Add the elements remaining in the auxiliary fragments to the buckets + // they belong to. This is for when the fragments weren't full and thus + // not flushed to the input array + for (long bucket_idx = 0; bucket_idx < SECONDARY_FANOUT; ++bucket_idx) { + // Set the writing offset to the beggining of the bucket + long write_off = bucket_end_offset[bucket_idx - 1]; + if (bucket_idx == 0) { + write_off = 0; + } + + // Add the elements left in the auxiliary fragments to the beggining + // of the predicted bucket, since it was not full + long elm_idx; + for (elm_idx = 0; (elm_idx < fragment_sizes[bucket_idx]) && + (write_off % SECONDARY_FRAGMENT_CAPACITY != 0); + ++elm_idx) { + primary_bucket_start[write_off++] = fragments[bucket_idx][elm_idx]; + } + + // Add the remaining elements from the auxiliary fragments to the end + // of the bucket it belongs to + write_off = + bucket_end_offset[bucket_idx] - fragment_sizes[bucket_idx]; + for (; elm_idx < fragment_sizes[bucket_idx]; ++elm_idx) { + primary_bucket_start[write_off + elm_idx] = + fragments[bucket_idx][elm_idx]; + ++bucket_start_off[bucket_idx]; + } + } + + // Cleanup + delete[] swap_buffer; + delete[] fragments; + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -// + // MODEL-BASED COUNTING SORT // + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -// + // Iterate over the secondary buckets + for (long secondary_bucket_idx = 0; + secondary_bucket_idx < SECONDARY_FANOUT; ++secondary_bucket_idx) { + auto secondary_bucket_sz = + secondary_bucket_sizes[secondary_bucket_idx]; + auto secondary_bucket_start_off = num_elms_finalized; + auto secondary_bucket_end_off = + secondary_bucket_start_off + secondary_bucket_sz; + + // Skip bucket if empty + if (secondary_bucket_sz == 0) continue; + + // Check for homogeneity + bool is_homogeneous = true; + if (rmi.enable_dups_detection) { + for (long elm_idx = secondary_bucket_start_off + 1; + elm_idx < secondary_bucket_end_off; ++elm_idx) { + if (begin[elm_idx] != begin[elm_idx - 1]) { + is_homogeneous = false; + break; + } + } + } + + if (!(rmi.enable_dups_detection and is_homogeneous)) { + long adjustment_offset = + 1. * + (primary_bucket_idx * SECONDARY_FANOUT + secondary_bucket_idx) * + input_sz / (PRIMARY_FANOUT * SECONDARY_FANOUT); + + // Saves the predicted CDFs for the Counting Sort subroutine + vector pred_cache_cs(secondary_bucket_sz); + + // Count array for the model-enhanced counting sort subroutine + vector cnt_hist(secondary_bucket_sz, 0); + + /* + * OPTIMIZATION + * We check to see if the first and last element in the current + * bucket used the same leaf model to obtain their CDF. If that is + * the case, then we don't need to traverse the CDF model for + * every element in this bucket, hence decreasing the inference + * complexity from O(num_layer) to O(1). 
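+           *
+           * If an element between the first and the last one would actually
+           * map to a different leaf model, its predicted slot may be slightly
+           * off; as with any model misprediction here, that is corrected by
+           * the final insertion-sort touch-up pass at the end of the sort.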
+ */ + + long pred_model_first_elm = static_cast(std::max( + 0., std::min(num_leaf_models - 1., + root_slope * begin[secondary_bucket_start_off] + + root_intercept))); + + long pred_model_last_elm = static_cast(std::max( + 0., std::min(num_leaf_models - 1., + root_slope * begin[secondary_bucket_end_off - 1] + + root_intercept))); + + if (pred_model_first_elm == pred_model_last_elm) { + // Avoid CDF model traversal and predict the CDF only using the + // leaf model + + // Iterate over the elements and place them into the secondary + // buckets + for (long elm_idx = 0; elm_idx < secondary_bucket_sz; ++elm_idx) { + // Find the current element + auto cur_key = begin[secondary_bucket_start_off + elm_idx]; + + // Predict the CDF + double pred_cdf = slopes[pred_model_first_elm] * cur_key + + intercepts[pred_model_first_elm]; + + // Scale the predicted CDF to the input size and save it + pred_cache_cs[elm_idx] = static_cast(std::max( + 0., std::min(secondary_bucket_sz - 1., + (pred_cdf * input_sz) - adjustment_offset))); + + // Update the counts + ++cnt_hist[pred_cache_cs[elm_idx]]; + } + } else { + // Fully traverse the CDF model again to predict the CDF of the + // current element + + // Iterate over the elements and place them into the minor + // buckets + for (long elm_idx = 0; elm_idx < secondary_bucket_sz; ++elm_idx) { + // Find the current element + auto cur_key = begin[secondary_bucket_start_off + elm_idx]; + + // Predict the model idx in the leaf layer + auto model_idx_next_layer = static_cast(std::max( + 0., std::min(num_leaf_models - 1., + root_slope * cur_key + root_intercept))); + // Predict the CDF + double pred_cdf = slopes[model_idx_next_layer] * cur_key + + intercepts[model_idx_next_layer]; + + // Scale the predicted CDF to the input size and save it + pred_cache_cs[elm_idx] = static_cast(std::max( + 0., std::min(secondary_bucket_sz - 1., + (pred_cdf * input_sz) - adjustment_offset))); + + // Update the counts + ++cnt_hist[pred_cache_cs[elm_idx]]; + } + } + + --cnt_hist[0]; + + // Calculate the running totals + for (long i = 1; i < secondary_bucket_sz; ++i) { + cnt_hist[i] += cnt_hist[i - 1]; + } + + // Allocate a temporary buffer for placing the keys in sorted + // order + vector tmp(secondary_bucket_sz); + + // Re-shuffle the elms based on the calculated cumulative counts + for (long elm_idx = 0; elm_idx < secondary_bucket_sz; ++elm_idx) { + // Place the element in the predicted position in the array + + tmp[cnt_hist[pred_cache_cs[elm_idx]]] = + begin[secondary_bucket_start_off + elm_idx]; + + // Update counts + --cnt_hist[pred_cache_cs[elm_idx]]; + } + + // Write back the temprorary buffer to the original input + std::copy(tmp.begin(), tmp.end(), + begin + secondary_bucket_start_off); + } + // Update the number of finalized elements + num_elms_finalized += secondary_bucket_sz; + } // end of iteration over the secondary buckets + + } // end of processing for non-flagged, non-homogeneous primary buckets + } // end of iteration over primary buckets + } + // clock_gettime(CLOCK_REALTIME, &ts2); + // printf("SECOND ROUND OF PARTITIONING %ld us\n", + // ToNanoseconds(SubtractTime(ts2, ts1)) / 1000); + + // Touch up + // clock_gettime(CLOCK_REALTIME, &ts1); + learned_sort::utils::insertion_sort(begin, end); + // clock_gettime(CLOCK_REALTIME, &ts2); + // printf("INSERTION SORT %ld us\n", + // ToNanoseconds(SubtractTime(ts2, ts1)) / 1000); +} + +/** + * @brief Sorts a sequence of numerical keys from [begin, end) using Learned + * Sort, in ascending order. 
+ * + * @tparam RandomIt A bi-directional random iterator over the sequence of keys + * @param begin Random-access iterators to the initial position of the + * sequence to be used for sorting. The range used is [begin,end), which + * contains all the elements between first and last, including the element + * pointed by first but not the element pointed by last. + * @param end Random-access iterators to the last position of the sequence to + * be used for sorting. The range used is [begin,end), which contains all the + * elements between first and last, including the element pointed by first but + * not the element pointed by last. + * @param params The hyperparameters for the CDF model, which describe the + * architecture and sampling ratio. + */ +template +void sort( + RandomIt begin, RandomIt end, + typename TwoLayerRMI::value_type>::Params + ¶ms) { + // Check if the data is already sorted + // struct timespec ts1, ts2; + // clock_gettime(CLOCK_REALTIME, &ts1); + if (*(end - 1) >= *begin && std::is_sorted(begin, end)) { + return; + } + // clock_gettime(CLOCK_REALTIME, &ts2); + // printf("IS SORTED %ld us\n", ToNanoseconds(SubtractTime(ts2, ts1)) / 1000); + + // Check if the data is sorted in descending order + // clock_gettime(CLOCK_REALTIME, &ts1); + if (*(end - 1) <= *begin) { + auto is_reverse_sorted = true; + + for (auto i = begin; i != end - 1; ++i) { + if (*i < *(i + 1)) { + is_reverse_sorted = false; + } + } + + if (is_reverse_sorted) { + std::reverse(begin, end); + return; + } + } + // clock_gettime(CLOCK_REALTIME, &ts2); + // printf("CHECK ORDER %ld us\n", ToNanoseconds(SubtractTime(ts2, ts1)) / 1000); + + if (std::distance(begin, end) <= + std::max(params.fanout * params.threshold, + 5 * params.num_leaf_models)) { + std::sort(begin, end); + } else { + // Initialize the RMI + TwoLayerRMI::value_type> rmi(params); + + // Check if the model can be trained + // clock_gettime(CLOCK_REALTIME, &ts1); + if (rmi.train(begin, end)) { + // clock_gettime(CLOCK_REALTIME, &ts2); + // printf("TRAIN %ld us\n", ToNanoseconds(SubtractTime(ts2, ts1)) / 1000); + // Sort the data if the model was successfully trained + learned_sort::sort(begin, end, rmi); + } + + else { // Fall back in case the model could not be trained + std::sort(begin, end); + } + } +} + +/** + * @brief Sorts a sequence of numerical keys from [begin, end) using Learned + * Sort, in ascending order. + * + * @tparam RandomIt A bi-directional random iterator over the sequence of keys + * @param begin Random-access iterators to the initial position of the + * sequence to be used for sorting. The range used is [begin,end), which + * contains all the elements between first and last, including the element + * pointed by first but not the element pointed by last. + * @param end Random-access iterators to the last position of the sequence to + * be used for sorting. The range used is [begin,end), which contains all the + * elements between first and last, including the element pointed by first but + * not the element pointed by last. 
+ */ +template +void sort(RandomIt begin, RandomIt end) { + if (begin != end) { + typename TwoLayerRMI::value_type>::Params + p; + learned_sort::sort(begin, end, p); + } +} + +} // namespace learned_sort + +#endif /* __cplusplus */ +COSMOPOLITAN_C_START_ + +void learned_sort_int64(int64_t *, size_t); + +COSMOPOLITAN_C_END_ +#endif /* COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_H_ */ diff --git a/third_party/libcxx/libcxx.mk b/third_party/libcxx/libcxx.mk index ff1ab6b55..c6103d788 100644 --- a/third_party/libcxx/libcxx.mk +++ b/third_party/libcxx/libcxx.mk @@ -8,6 +8,11 @@ THIRD_PARTY_LIBCXX = $(THIRD_PARTY_LIBCXX_A_DEPS) $(THIRD_PARTY_LIBCXX_A) THIRD_PARTY_LIBCXX_A = o/$(MODE)/third_party/libcxx/libcxx.a THIRD_PARTY_LIBCXX_A_HDRS = \ + third_party/libcxx/rmi.h \ + third_party/libcxx/utils.h \ + third_party/libcxx/learned_sort.h \ + third_party/libcxx/pdqsort.h \ + third_party/libcxx/ska_sort.h \ third_party/libcxx/__bit_reference \ third_party/libcxx/__bsd_locale_fallbacks.h \ third_party/libcxx/__config \ @@ -138,6 +143,9 @@ THIRD_PARTY_LIBCXX_A_HDRS = \ third_party/libcxx/wctype.h THIRD_PARTY_LIBCXX_A_SRCS_CC = \ + third_party/libcxx/learned_sort.cc \ + third_party/libcxx/pdqsort.cc \ + third_party/libcxx/ska_sort.cc \ third_party/libcxx/algorithm.cc \ third_party/libcxx/charconv.cc \ third_party/libcxx/chrono.cc \ diff --git a/third_party/libcxx/pdqsort.cc b/third_party/libcxx/pdqsort.cc new file mode 100644 index 000000000..c2bc41fc3 --- /dev/null +++ b/third_party/libcxx/pdqsort.cc @@ -0,0 +1,25 @@ +/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│ +│vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2023 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "third_party/libcxx/pdqsort.h" + +void pdqsort_int64(int64_t* A, size_t n) { pdqsort(A, A + n); } + +void pdqsort_branchless_int64(int64_t* A, size_t n) { + pdqsort_branchless(A, A + n); +} diff --git a/third_party/libcxx/pdqsort.h b/third_party/libcxx/pdqsort.h new file mode 100644 index 000000000..169e49dc2 --- /dev/null +++ b/third_party/libcxx/pdqsort.h @@ -0,0 +1,552 @@ +/* -*- c++ -*- */ +/* clang-format off */ +/* + pdqsort.h - Pattern-defeating quicksort. + + Copyright (c) 2015 Orson Peters + + This software is provided 'as-is', without any express or implied warranty. In no event will the + authors be held liable for any damages arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, including commercial + applications, and to alter it and redistribute it freely, subject to the following restrictions: + + 1. 
The origin of this software must not be misrepresented; you must not claim that you wrote the + original software. If you use this software in a product, an acknowledgment in the product + documentation would be appreciated but is not required. + + 2. Altered source versions must be plainly marked as such, and must not be misrepresented as + being the original software. + + 3. This notice may not be removed or altered from any source distribution. +*/ + + +#ifndef PDQSORT_H +#define PDQSORT_H +#ifdef __cplusplus +#include "third_party/libcxx/algorithm" +#include "third_party/libcxx/cstddef" +#include "third_party/libcxx/functional" +#include "third_party/libcxx/utility" +#include "third_party/libcxx/iterator" + +#if __cplusplus >= 201103L + #include "third_party/libcxx/cstdint" + #include "third_party/libcxx/type_traits" + #define PDQSORT_PREFER_MOVE(x) std::move(x) +#else + #define PDQSORT_PREFER_MOVE(x) (x) +#endif + + +namespace pdqsort_detail { + enum { + // Partitions below this size are sorted using insertion sort. + insertion_sort_threshold = 24, + + // Partitions above this size use Tukey's ninther to select the pivot. + ninther_threshold = 128, + + // When we detect an already sorted partition, attempt an insertion sort that allows this + // amount of element moves before giving up. + partial_insertion_sort_limit = 8, + + // Must be multiple of 8 due to loop unrolling, and < 256 to fit in unsigned char. + block_size = 64, + + // Cacheline size, assumes power of two. + cacheline_size = 64 + + }; + +#if __cplusplus >= 201103L + template struct is_default_compare : std::false_type { }; + template struct is_default_compare > : std::true_type { }; + template struct is_default_compare > : std::true_type { }; +#endif + + // Returns floor(log2(n)), assumes n > 0. + template + inline int log2(T n) { + int log = 0; + while (n >>= 1) ++log; + return log; + } + + // Sorts [begin, end) using insertion sort with the given comparison function. + template + inline void insertion_sort(Iter begin, Iter end, Compare comp) { + typedef typename std::iterator_traits::value_type T; + if (begin == end) return; + + for (Iter cur = begin + 1; cur != end; ++cur) { + Iter sift = cur; + Iter sift_1 = cur - 1; + + // Compare first so we can avoid 2 moves for an element already positioned correctly. + if (comp(*sift, *sift_1)) { + T tmp = PDQSORT_PREFER_MOVE(*sift); + + do { *sift-- = PDQSORT_PREFER_MOVE(*sift_1); } + while (sift != begin && comp(tmp, *--sift_1)); + + *sift = PDQSORT_PREFER_MOVE(tmp); + } + } + } + + // Sorts [begin, end) using insertion sort with the given comparison function. Assumes + // *(begin - 1) is an element smaller than or equal to any element in [begin, end). + template + inline void unguarded_insertion_sort(Iter begin, Iter end, Compare comp) { + typedef typename std::iterator_traits::value_type T; + if (begin == end) return; + + for (Iter cur = begin + 1; cur != end; ++cur) { + Iter sift = cur; + Iter sift_1 = cur - 1; + + // Compare first so we can avoid 2 moves for an element already positioned correctly. + if (comp(*sift, *sift_1)) { + T tmp = PDQSORT_PREFER_MOVE(*sift); + + do { *sift-- = PDQSORT_PREFER_MOVE(*sift_1); } + while (comp(tmp, *--sift_1)); + + *sift = PDQSORT_PREFER_MOVE(tmp); + } + } + } + + // Attempts to use insertion sort on [begin, end). Will return false if more than + // partial_insertion_sort_limit elements were moved, and abort sorting. Otherwise it will + // successfully sort and return true. 
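+    // pdqsort attempts this on partitions that came back already partitioned
+    // (the pivot swapped nothing), on the guess that they are probably sorted
+    // already; the move limit bounds the wasted work when that guess is wrong.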
+ template + inline bool partial_insertion_sort(Iter begin, Iter end, Compare comp) { + typedef typename std::iterator_traits::value_type T; + if (begin == end) return true; + + std::size_t limit = 0; + for (Iter cur = begin + 1; cur != end; ++cur) { + Iter sift = cur; + Iter sift_1 = cur - 1; + + // Compare first so we can avoid 2 moves for an element already positioned correctly. + if (comp(*sift, *sift_1)) { + T tmp = PDQSORT_PREFER_MOVE(*sift); + + do { *sift-- = PDQSORT_PREFER_MOVE(*sift_1); } + while (sift != begin && comp(tmp, *--sift_1)); + + *sift = PDQSORT_PREFER_MOVE(tmp); + limit += cur - sift; + } + + if (limit > partial_insertion_sort_limit) return false; + } + + return true; + } + + template + inline void sort2(Iter a, Iter b, Compare comp) { + if (comp(*b, *a)) std::iter_swap(a, b); + } + + // Sorts the elements *a, *b and *c using comparison function comp. + template + inline void sort3(Iter a, Iter b, Iter c, Compare comp) { + sort2(a, b, comp); + sort2(b, c, comp); + sort2(a, b, comp); + } + + template + inline T* align_cacheline(T* p) { +#if defined(UINTPTR_MAX) && __cplusplus >= 201103L + std::uintptr_t ip = reinterpret_cast(p); +#else + std::size_t ip = reinterpret_cast(p); +#endif + ip = (ip + cacheline_size - 1) & -cacheline_size; + return reinterpret_cast(ip); + } + + template + inline void swap_offsets(Iter first, Iter last, + unsigned char* offsets_l, unsigned char* offsets_r, + int num, bool use_swaps) { + typedef typename std::iterator_traits::value_type T; + if (use_swaps) { + // This case is needed for the descending distribution, where we need + // to have proper swapping for pdqsort to remain O(n). + for (int i = 0; i < num; ++i) { + std::iter_swap(first + offsets_l[i], last - offsets_r[i]); + } + } else if (num > 0) { + Iter l = first + offsets_l[0]; Iter r = last - offsets_r[0]; + T tmp(PDQSORT_PREFER_MOVE(*l)); *l = PDQSORT_PREFER_MOVE(*r); + for (int i = 1; i < num; ++i) { + l = first + offsets_l[i]; *r = PDQSORT_PREFER_MOVE(*l); + r = last - offsets_r[i]; *l = PDQSORT_PREFER_MOVE(*r); + } + *r = PDQSORT_PREFER_MOVE(tmp); + } + } + + // Partitions [begin, end) around pivot *begin using comparison function comp. Elements equal + // to the pivot are put in the right-hand partition. Returns the position of the pivot after + // partitioning and whether the passed sequence already was correctly partitioned. Assumes the + // pivot is a median of at least 3 elements and that [begin, end) is at least + // insertion_sort_threshold long. Uses branchless partitioning. + template + inline std::pair partition_right_branchless(Iter begin, Iter end, Compare comp) { + typedef typename std::iterator_traits::value_type T; + + // Move pivot into local for speed. + T pivot(PDQSORT_PREFER_MOVE(*begin)); + Iter first = begin; + Iter last = end; + + // Find the first element greater than or equal than the pivot (the median of 3 guarantees + // this exists). + while (comp(*++first, pivot)); + + // Find the first element strictly smaller than the pivot. We have to guard this search if + // there was no element before *first. + if (first - 1 == begin) while (first < last && !comp(*--last, pivot)); + else while ( !comp(*--last, pivot)); + + // If the first pair of elements that should be swapped to partition are the same element, + // the passed in sequence already was correctly partitioned. 
+ bool already_partitioned = first >= last; + if (!already_partitioned) { + std::iter_swap(first, last); + ++first; + } + + // The following branchless partitioning is derived from "BlockQuicksort: How Branch + // Mispredictions don’t affect Quicksort" by Stefan Edelkamp and Armin Weiss. + unsigned char offsets_l_storage[block_size + cacheline_size]; + unsigned char offsets_r_storage[block_size + cacheline_size]; + unsigned char* offsets_l = align_cacheline(offsets_l_storage); + unsigned char* offsets_r = align_cacheline(offsets_r_storage); + int num_l, num_r, start_l, start_r; + num_l = num_r = start_l = start_r = 0; + + while (last - first > 2 * block_size) { + // Fill up offset blocks with elements that are on the wrong side. + if (num_l == 0) { + start_l = 0; + Iter it = first; + for (unsigned char i = 0; i < block_size;) { + offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it; + offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it; + offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it; + offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it; + offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it; + offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it; + offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it; + offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it; + } + } + if (num_r == 0) { + start_r = 0; + Iter it = last; + for (unsigned char i = 0; i < block_size;) { + offsets_r[num_r] = ++i; num_r += comp(*--it, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--it, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--it, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--it, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--it, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--it, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--it, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--it, pivot); + } + } + + // Swap elements and update block sizes and first/last boundaries. + int num = std::min(num_l, num_r); + swap_offsets(first, last, offsets_l + start_l, offsets_r + start_r, + num, num_l == num_r); + num_l -= num; num_r -= num; + start_l += num; start_r += num; + if (num_l == 0) first += block_size; + if (num_r == 0) last -= block_size; + } + + int l_size = 0, r_size = 0; + int unknown_left = (int)(last - first) - ((num_r || num_l) ? block_size : 0); + if (num_r) { + // Handle leftover block by assigning the unknown elements to the other block. + l_size = unknown_left; + r_size = block_size; + } else if (num_l) { + l_size = block_size; + r_size = unknown_left; + } else { + // No leftover block, split the unknown elements in two blocks. + l_size = unknown_left/2; + r_size = unknown_left - l_size; + } + + // Fill offset buffers if needed. + if (unknown_left && !num_l) { + start_l = 0; + Iter it = first; + for (unsigned char i = 0; i < l_size;) { + offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it; + } + } + if (unknown_left && !num_r) { + start_r = 0; + Iter it = last; + for (unsigned char i = 0; i < r_size;) { + offsets_r[num_r] = ++i; num_r += comp(*--it, pivot); + } + } + + int num = std::min(num_l, num_r); + swap_offsets(first, last, offsets_l + start_l, offsets_r + start_r, num, num_l == num_r); + num_l -= num; num_r -= num; + start_l += num; start_r += num; + if (num_l == 0) first += l_size; + if (num_r == 0) last -= r_size; + + // We have now fully identified [first, last)'s proper position. Swap the last elements. 
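+        // At this point at most one of the two offset buffers still holds
+        // entries; they index elements that remain on the wrong side of the
+        // split, so they are swapped inward one at a time until first lands
+        // exactly on the boundary between the two partitions.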
+ if (num_l) { + offsets_l += start_l; + while (num_l--) std::iter_swap(first + offsets_l[num_l], --last); + first = last; + } + if (num_r) { + offsets_r += start_r; + while (num_r--) std::iter_swap(last - offsets_r[num_r], first), ++first; + last = first; + } + + // Put the pivot in the right place. + Iter pivot_pos = first - 1; + *begin = PDQSORT_PREFER_MOVE(*pivot_pos); + *pivot_pos = PDQSORT_PREFER_MOVE(pivot); + + return std::make_pair(pivot_pos, already_partitioned); + } + + // Partitions [begin, end) around pivot *begin using comparison function comp. Elements equal + // to the pivot are put in the right-hand partition. Returns the position of the pivot after + // partitioning and whether the passed sequence already was correctly partitioned. Assumes the + // pivot is a median of at least 3 elements and that [begin, end) is at least + // insertion_sort_threshold long. + template + inline std::pair partition_right(Iter begin, Iter end, Compare comp) { + typedef typename std::iterator_traits::value_type T; + + // Move pivot into local for speed. + T pivot(PDQSORT_PREFER_MOVE(*begin)); + + Iter first = begin; + Iter last = end; + + // Find the first element greater than or equal than the pivot (the median of 3 guarantees + // this exists). + while (comp(*++first, pivot)); + + // Find the first element strictly smaller than the pivot. We have to guard this search if + // there was no element before *first. + if (first - 1 == begin) while (first < last && !comp(*--last, pivot)); + else while ( !comp(*--last, pivot)); + + // If the first pair of elements that should be swapped to partition are the same element, + // the passed in sequence already was correctly partitioned. + bool already_partitioned = first >= last; + + // Keep swapping pairs of elements that are on the wrong side of the pivot. Previously + // swapped pairs guard the searches, which is why the first iteration is special-cased + // above. + while (first < last) { + std::iter_swap(first, last); + while (comp(*++first, pivot)); + while (!comp(*--last, pivot)); + } + + // Put the pivot in the right place. + Iter pivot_pos = first - 1; + *begin = PDQSORT_PREFER_MOVE(*pivot_pos); + *pivot_pos = PDQSORT_PREFER_MOVE(pivot); + + return std::make_pair(pivot_pos, already_partitioned); + } + + // Similar function to the one above, except elements equal to the pivot are put to the left of + // the pivot and it doesn't check or return if the passed sequence already was partitioned. + // Since this is rarely used (the many equal case), and in that case pdqsort already has O(n) + // performance, no block quicksort is applied here for simplicity. + template + inline Iter partition_left(Iter begin, Iter end, Compare comp) { + typedef typename std::iterator_traits::value_type T; + + T pivot(PDQSORT_PREFER_MOVE(*begin)); + Iter first = begin; + Iter last = end; + + while (comp(pivot, *--last)); + + if (last + 1 == end) while (first < last && !comp(pivot, *++first)); + else while ( !comp(pivot, *++first)); + + while (first < last) { + std::iter_swap(first, last); + while (comp(pivot, *--last)); + while (!comp(pivot, *++first)); + } + + Iter pivot_pos = last; + *begin = PDQSORT_PREFER_MOVE(*pivot_pos); + *pivot_pos = PDQSORT_PREFER_MOVE(pivot); + + return pivot_pos; + } + + + template + inline void pdqsort_loop(Iter begin, Iter end, Compare comp, int bad_allowed, bool leftmost = true) { + typedef typename std::iterator_traits::difference_type diff_t; + + // Use a while loop for tail recursion elimination. 
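+        // The bad_allowed counter passed in by the public pdqsort wrappers
+        // starts at floor(log2(n)); every highly unbalanced partition
+        // decrements it, and when it reaches zero the remaining range is
+        // finished with std::make_heap/std::sort_heap, which bounds the
+        // worst case at O(n log n).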
+ while (true) { + diff_t size = end - begin; + + // Insertion sort is faster for small arrays. + if (size < insertion_sort_threshold) { + if (leftmost) insertion_sort(begin, end, comp); + else unguarded_insertion_sort(begin, end, comp); + return; + } + + // Choose pivot as median of 3 or pseudomedian of 9. + diff_t s2 = size / 2; + if (size > ninther_threshold) { + sort3(begin, begin + s2, end - 1, comp); + sort3(begin + 1, begin + (s2 - 1), end - 2, comp); + sort3(begin + 2, begin + (s2 + 1), end - 3, comp); + sort3(begin + (s2 - 1), begin + s2, begin + (s2 + 1), comp); + std::iter_swap(begin, begin + s2); + } else sort3(begin + s2, begin, end - 1, comp); + + // If *(begin - 1) is the end of the right partition of a previous partition operation + // there is no element in [begin, end) that is smaller than *(begin - 1). Then if our + // pivot compares equal to *(begin - 1) we change strategy, putting equal elements in + // the left partition, greater elements in the right partition. We do not have to + // recurse on the left partition, since it's sorted (all equal). + if (!leftmost && !comp(*(begin - 1), *begin)) { + begin = partition_left(begin, end, comp) + 1; + continue; + } + + // Partition and get results. + std::pair part_result = + Branchless ? partition_right_branchless(begin, end, comp) + : partition_right(begin, end, comp); + Iter pivot_pos = part_result.first; + bool already_partitioned = part_result.second; + + // Check for a highly unbalanced partition. + diff_t l_size = pivot_pos - begin; + diff_t r_size = end - (pivot_pos + 1); + bool highly_unbalanced = l_size < size / 8 || r_size < size / 8; + + // If we got a highly unbalanced partition we shuffle elements to break many patterns. + if (highly_unbalanced) { + // If we had too many bad partitions, switch to heapsort to guarantee O(n log n). + if (--bad_allowed == 0) { + std::make_heap(begin, end, comp); + std::sort_heap(begin, end, comp); + return; + } + + if (l_size >= insertion_sort_threshold) { + std::iter_swap(begin, begin + l_size / 4); + std::iter_swap(pivot_pos - 1, pivot_pos - l_size / 4); + + if (l_size > ninther_threshold) { + std::iter_swap(begin + 1, begin + (l_size / 4 + 1)); + std::iter_swap(begin + 2, begin + (l_size / 4 + 2)); + std::iter_swap(pivot_pos - 2, pivot_pos - (l_size / 4 + 1)); + std::iter_swap(pivot_pos - 3, pivot_pos - (l_size / 4 + 2)); + } + } + + if (r_size >= insertion_sort_threshold) { + std::iter_swap(pivot_pos + 1, pivot_pos + (1 + r_size / 4)); + std::iter_swap(end - 1, end - r_size / 4); + + if (r_size > ninther_threshold) { + std::iter_swap(pivot_pos + 2, pivot_pos + (2 + r_size / 4)); + std::iter_swap(pivot_pos + 3, pivot_pos + (3 + r_size / 4)); + std::iter_swap(end - 2, end - (1 + r_size / 4)); + std::iter_swap(end - 3, end - (2 + r_size / 4)); + } + } + } else { + // If we were decently balanced and we tried to sort an already partitioned + // sequence try to use insertion sort. + if (already_partitioned && partial_insertion_sort(begin, pivot_pos, comp) + && partial_insertion_sort(pivot_pos + 1, end, comp)) return; + } + + // Sort the left partition first using recursion and do tail recursion elimination for + // the right-hand partition. 
+ pdqsort_loop(begin, pivot_pos, comp, bad_allowed, leftmost); + begin = pivot_pos + 1; + leftmost = false; + } + } +} + + +template +inline void pdqsort(Iter begin, Iter end, Compare comp) { + if (begin == end) return; + +#if __cplusplus >= 201103L + pdqsort_detail::pdqsort_loop::type>::value && + std::is_arithmetic::value_type>::value>( + begin, end, comp, pdqsort_detail::log2(end - begin)); +#else + pdqsort_detail::pdqsort_loop( + begin, end, comp, pdqsort_detail::log2(end - begin)); +#endif +} + +template +inline void pdqsort(Iter begin, Iter end) { + typedef typename std::iterator_traits::value_type T; + pdqsort(begin, end, std::less()); +} + +template +inline void pdqsort_branchless(Iter begin, Iter end, Compare comp) { + if (begin == end) return; + pdqsort_detail::pdqsort_loop( + begin, end, comp, pdqsort_detail::log2(end - begin)); +} + +template +inline void pdqsort_branchless(Iter begin, Iter end) { + typedef typename std::iterator_traits::value_type T; + pdqsort_branchless(begin, end, std::less()); +} + + +#undef PDQSORT_PREFER_MOVE +#endif /* __cplusplus */ +COSMOPOLITAN_C_START_ + +void pdqsort_int64(int64_t *, size_t); +void pdqsort_branchless_int64(int64_t *, size_t); + +COSMOPOLITAN_C_END_ +#endif /* PDQSORT_H */ diff --git a/third_party/libcxx/rmi.h b/third_party/libcxx/rmi.h new file mode 100644 index 000000000..8e389dc5f --- /dev/null +++ b/third_party/libcxx/rmi.h @@ -0,0 +1,308 @@ +// -*- c++ -*- +// clang-format off +#ifndef COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_RMI_H_ +#define COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_RMI_H_ +#include "third_party/libcxx/iostream" +#include "third_party/libcxx/vector" + +using namespace std; + +namespace learned_sort { + +// Packs the key and its respective scaled CDF value +template +struct training_point { + T x; + double y; +}; + +// Represents linear models +struct linear_model { + double slope = 0; + double intercept = 0; +}; + +// An implementation of a 2-layer RMI model +template +class TwoLayerRMI { + public: + // CDF model hyperparameters + struct Params { + // Member fields + long fanout; + float sampling_rate; + long threshold; + long num_leaf_models; + + // Default hyperparameters + static constexpr long DEFAULT_FANOUT = 1e3; + static constexpr float DEFAULT_SAMPLING_RATE = .01; + static constexpr long DEFAULT_THRESHOLD = 100; + static constexpr long DEFAULT_NUM_LEAF_MODELS = 1000; + static constexpr long MIN_SORTING_SIZE = 1e4; + + // Default constructor + Params() { + this->fanout = DEFAULT_FANOUT; + this->sampling_rate = DEFAULT_SAMPLING_RATE; + this->threshold = DEFAULT_THRESHOLD; + this->num_leaf_models = DEFAULT_NUM_LEAF_MODELS; + } + + // Constructor with custom hyperparameter values + Params(float sampling_rate, float overallocation, long fanout, + long batch_sz, long threshold) { + this->fanout = fanout; + this->sampling_rate = sampling_rate; + this->threshold = threshold; + this->num_leaf_models = DEFAULT_NUM_LEAF_MODELS; + } + }; + + // Member variables of the CDF model + bool trained; + linear_model root_model; + vector leaf_models; + vector training_sample; + Params hp; + bool enable_dups_detection; + + // CDF model constructor + TwoLayerRMI(Params p) { + this->trained = false; + this->hp = p; + this->leaf_models.resize(p.num_leaf_models); + this->enable_dups_detection = true; + } + + // Pretty-printing function + void print() { + printf("[0][0]: slope=%0.5f; intercept=%0.5f;\n", root_model.slope, + root_model.intercept); + for (int model_idx = 0; model_idx < hp.num_leaf_models; ++model_idx) { + printf("[%i][1]: 
slope=%0.5f; intercept=%0.5f;\n", model_idx, + leaf_models[model_idx].slope, leaf_models[model_idx].intercept); + } + cout << "-----------------------------" << endl; + } + + /** + * @brief Train a CDF function with an RMI architecture, using linear spline + * interpolation. + * + * @param begin Random-access iterators to the initial position of the + * sequence to be used for sorting. The range used is [begin,end), which + * contains all the elements between first and last, including the element + * pointed by first but not the element pointed by last. + * @param end Random-access iterators to the last position of the sequence to + * be used for sorting. The range used is [begin,end), which contains all the + * elements between first and last, including the element pointed by first but + * not the element pointed by last. + * @return true if the model was trained successfully, false otherwise. + */ + bool train(typename vector::iterator begin, + typename vector::iterator end) { + // Determine input size + const long INPUT_SZ = std::distance(begin, end); + + // Validate parameters + if (this->hp.fanout >= INPUT_SZ) { + this->hp.fanout = TwoLayerRMI::Params::DEFAULT_FANOUT; + cerr << "\33[93;1mWARNING\33[0m: Invalid fanout. Using default (" + << TwoLayerRMI::Params::DEFAULT_FANOUT << ")." << endl; + } + + if (this->hp.sampling_rate <= 0 or this->hp.sampling_rate > 1) { + this->hp.sampling_rate = TwoLayerRMI::Params::DEFAULT_SAMPLING_RATE; + cerr << "\33[93;1mWARNING\33[0m: Invalid sampling rate. Using default (" + << TwoLayerRMI::Params::DEFAULT_SAMPLING_RATE << ")." << endl; + } + + if (this->hp.threshold <= 0 or this->hp.threshold >= INPUT_SZ or + this->hp.threshold >= INPUT_SZ / this->hp.fanout) { + this->hp.threshold = TwoLayerRMI::Params::DEFAULT_THRESHOLD; + cerr << "\33[93;1mWARNING\33[0m: Invalid threshold. Using default (" + << TwoLayerRMI::Params::DEFAULT_THRESHOLD << ")." << endl; + } + + // Initialize the CDF model + static const long NUM_LAYERS = 2; + vector > > > training_data(NUM_LAYERS); + for (long layer_idx = 0; layer_idx < NUM_LAYERS; ++layer_idx) { + training_data[layer_idx].resize(hp.num_leaf_models); + } + + //----------------------------------------------------------// + // SAMPLE // + //----------------------------------------------------------// + + // Determine sample size + const long SAMPLE_SZ = std::min( + INPUT_SZ, std::max(this->hp.sampling_rate * INPUT_SZ, + TwoLayerRMI::Params::MIN_SORTING_SIZE)); + + // Create a sample array + this->training_sample.reserve(SAMPLE_SZ); + + // Start sampling + long offset = static_cast(1. * INPUT_SZ / SAMPLE_SZ); + for (auto i = begin; i < end; i += offset) { + // NOTE: We don't directly assign SAMPLE_SZ to this->training_sample_sz + // to avoid issues with divisibility + this->training_sample.push_back(*i); + } + + // Sort the sample using the provided comparison function + std::sort(this->training_sample.begin(), this->training_sample.end()); + + // Count the number of unique keys + auto sample_cpy = this->training_sample; + long num_unique_elms = std::distance( + sample_cpy.begin(), std::unique(sample_cpy.begin(), sample_cpy.end())); + + // Stop early if the array has very few unique values. We need at least 2 + // unique training examples per leaf model. 
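+    // With the default of 1000 leaf models the sample therefore needs at
+    // least 2000 distinct keys; otherwise train() reports failure so the
+    // caller can fall back to an ordinary comparison sort. If more than 90%
+    // of the sampled keys are unique, duplicate detection is switched off.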
+ if (num_unique_elms < 2 * this->hp.num_leaf_models) { + return false; + } else if (num_unique_elms > .9 * training_sample.size()) { + this->enable_dups_detection = false; + } + + //----------------------------------------------------------// + // TRAIN THE MODELS // + //----------------------------------------------------------// + + // Populate the training data for the root model + for (long i = 0; i < SAMPLE_SZ; ++i) { + training_data[0][0].push_back( + {this->training_sample[i], 1. * i / SAMPLE_SZ}); + } + + // Train the root model using linear interpolation + auto *current_training_data = &training_data[0][0]; + linear_model *current_model = &(this->root_model); + + // Find the min and max values in the training set + training_point min = current_training_data->front(); + training_point max = current_training_data->back(); + + // Calculate the slope and intercept terms, assuming min.y = 0 and max.y + current_model->slope = 1. / (max.x - min.x); + current_model->intercept = -current_model->slope * min.x; + + // Extrapolate for the number of models in the next layer + current_model->slope *= this->hp.num_leaf_models - 1; + current_model->intercept *= this->hp.num_leaf_models - 1; + + // Populate the training data for the next layer + for (const auto &d : *current_training_data) { + // Predict the model index in next layer + long rank = current_model->slope * d.x + current_model->intercept; + + // Normalize the rank between 0 and the number of models in the next layer + rank = std::max(static_cast(0), + std::min(this->hp.num_leaf_models - 1, rank)); + + // Place the data in the predicted training bucket + training_data[1][rank].push_back(d); + } + + // Train the leaf models + for (long model_idx = 0; model_idx < this->hp.num_leaf_models; + ++model_idx) { + // Update iterator variables + current_training_data = &training_data[1][model_idx]; + current_model = &(this->leaf_models[model_idx]); + + // Interpolate the min points in the training buckets + if (model_idx == 0) { + // The current model is the first model in the current layer + + if (current_training_data->size() < 2) { + // Case 1: The first model in this layer is empty + current_model->slope = 0; + current_model->intercept = 0; + + // Insert a fictive training point to avoid propagating more than one + // empty initial models. + training_point tp; + tp.x = 0; + tp.y = 0; + current_training_data->push_back(tp); + } else { + // Case 2: The first model in this layer is not empty + + min = current_training_data->front(); + max = current_training_data->back(); + + // Hallucinating as if min.y = 0 + current_model->slope = (1. * max.y) / (max.x - min.x); + current_model->intercept = min.y - current_model->slope * min.x; + } + } else if (model_idx == this->hp.num_leaf_models - 1) { + if (current_training_data->empty()) { + // Case 3: The final model in this layer is empty + + current_model->slope = 0; + current_model->intercept = 1; + } else { + // Case 4: The last model in this layer is not empty + + min = training_data[1][model_idx - 1].back(); + max = current_training_data->back(); + + // Hallucinating as if max.y = 1 + current_model->slope = (1. 
- min.y) / (max.x - min.x); + current_model->intercept = min.y - current_model->slope * min.x; + } + } else { + // The current model is not the first model in the current layer + + if (current_training_data->empty()) { + // Case 5: The intermediate model in this layer is empty + current_model->slope = 0; + current_model->intercept = training_data[1][model_idx - 1] + .back() + .y; // If the previous model + // was empty too, it will + // use the fictive + // training points + + // Insert a fictive training point to avoid propagating more than one + // empty initial models. + // NOTE: This will _NOT_ throw to DIV/0 due to identical x's and y's + // because it is working backwards. + training_point tp; + tp.x = training_data[1][model_idx - 1].back().x; + tp.y = training_data[1][model_idx - 1].back().y; + current_training_data->push_back(tp); + } else { + // Case 6: The intermediate leaf model is not empty + + min = training_data[1][model_idx - 1].back(); + max = current_training_data->back(); + + current_model->slope = (max.y - min.y) / (max.x - min.x); + current_model->intercept = min.y - current_model->slope * min.x; + } + } + } + + // NOTE: + // The last stage (layer) of this model contains weights that predict the + // CDF of the keys (i.e. Range is [0-1]) When using this model to predict + // the position of the keys in the sorted order, you MUST scale the weights + // of the last layer to whatever range you are predicting for. The inner + // layers of the model have already been extrapolated to the length of the + // stage.git + // + // This is a design choice to help with the portability of the model. + // + this->trained = true; + + return true; + } +}; +} // namespace learned_sort + +#endif /* COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_RMI_H_ */ diff --git a/third_party/libcxx/ska_sort.cc b/third_party/libcxx/ska_sort.cc new file mode 100644 index 000000000..43b70a7d2 --- /dev/null +++ b/third_party/libcxx/ska_sort.cc @@ -0,0 +1,22 @@ +/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│ +│vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2023 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "third_party/libcxx/ska_sort.h" +#include "third_party/libcxx/ska_sort.h" + +void ska_sort_int64(int64_t* A, size_t n) { ska_sort(A, A + n); } diff --git a/third_party/libcxx/ska_sort.h b/third_party/libcxx/ska_sort.h new file mode 100644 index 000000000..7b808bd4b --- /dev/null +++ b/third_party/libcxx/ska_sort.h @@ -0,0 +1,1457 @@ +// -*- c++ -*- +// clang-format off +// +// Copyright Malte Skarupke 2016. 
+// Distributed under the Boost Software License, Version 1.0. +// (See http://www.boost.org/LICENSE_1_0.txt) +// +#ifndef COSMOPOLITAN_LIBC_MEM_SKA_SORT_H_ +#define COSMOPOLITAN_LIBC_MEM_SKA_SORT_H_ +#ifdef __cplusplus +#include "third_party/libcxx/algorithm" +#include "third_party/libcxx/cstdint" +#include "third_party/libcxx/tuple" +#include "third_party/libcxx/type_traits" +#include "third_party/libcxx/utility" + +namespace detail +{ +template +void counting_sort_impl(It begin, It end, OutIt out_begin, ExtractKey && extract_key) +{ + count_type counts[256] = {}; + for (It it = begin; it != end; ++it) + { + ++counts[extract_key(*it)]; + } + count_type total = 0; + for (count_type & count : counts) + { + count_type old_count = count; + count = total; + total += old_count; + } + for (; begin != end; ++begin) + { + std::uint8_t key = extract_key(*begin); + out_begin[counts[key]++] = std::move(*begin); + } +} +template +void counting_sort_impl(It begin, It end, OutIt out_begin, ExtractKey && extract_key) +{ + counting_sort_impl(begin, end, out_begin, extract_key); +} +inline bool to_unsigned_or_bool(bool b) +{ + return b; +} +inline unsigned char to_unsigned_or_bool(unsigned char c) +{ + return c; +} +inline unsigned char to_unsigned_or_bool(signed char c) +{ + return static_cast(c) + 128; +} +inline unsigned char to_unsigned_or_bool(char c) +{ + return static_cast(c); +} +inline std::uint16_t to_unsigned_or_bool(char16_t c) +{ + return static_cast(c); +} +inline std::uint32_t to_unsigned_or_bool(char32_t c) +{ + return static_cast(c); +} +inline std::uint32_t to_unsigned_or_bool(wchar_t c) +{ + return static_cast(c); +} +inline unsigned short to_unsigned_or_bool(short i) +{ + return static_cast(i) + static_cast(1 << (sizeof(short) * 8 - 1)); +} +inline unsigned short to_unsigned_or_bool(unsigned short i) +{ + return i; +} +inline unsigned int to_unsigned_or_bool(int i) +{ + return static_cast(i) + static_cast(1 << (sizeof(int) * 8 - 1)); +} +inline unsigned int to_unsigned_or_bool(unsigned int i) +{ + return i; +} +inline unsigned long to_unsigned_or_bool(long l) +{ + return static_cast(l) + static_cast(1l << (sizeof(long) * 8 - 1)); +} +inline unsigned long to_unsigned_or_bool(unsigned long l) +{ + return l; +} +inline unsigned long long to_unsigned_or_bool(long long l) +{ + return static_cast(l) + static_cast(1ll << (sizeof(long long) * 8 - 1)); +} +inline unsigned long long to_unsigned_or_bool(unsigned long long l) +{ + return l; +} +inline std::uint32_t to_unsigned_or_bool(float f) +{ + union + { + float f; + std::uint32_t u; + } as_union = { f }; + std::uint32_t sign_bit = -std::int32_t(as_union.u >> 31); + return as_union.u ^ (sign_bit | 0x80000000); +} +inline std::uint64_t to_unsigned_or_bool(double f) +{ + union + { + double d; + std::uint64_t u; + } as_union = { f }; + std::uint64_t sign_bit = -std::int64_t(as_union.u >> 63); + return as_union.u ^ (sign_bit | 0x8000000000000000); +} +template +inline size_t to_unsigned_or_bool(T * ptr) +{ + return reinterpret_cast(ptr); +} + +template +struct SizedRadixSorter; + +template<> +struct SizedRadixSorter<1> +{ + template + static bool sort(It begin, It end, OutIt buffer_begin, ExtractKey && extract_key) + { + counting_sort_impl(begin, end, buffer_begin, [&](auto && o) + { + return to_unsigned_or_bool(extract_key(o)); + }); + return true; + } + + static constexpr size_t pass_count = 2; +}; +template<> +struct SizedRadixSorter<2> +{ + template + static bool sort(It begin, It end, OutIt buffer_begin, ExtractKey && extract_key) + { + 
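+        // The size check below only chooses the width of the per-pass
+        // counters: 32-bit counting arrays suffice for up to 2^32 elements,
+        // larger inputs use 64-bit counters before the byte-wise passes run.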
std::ptrdiff_t num_elements = end - begin; + if (num_elements <= (1ll << 32)) + return sort_inline(begin, end, buffer_begin, buffer_begin + num_elements, extract_key); + else + return sort_inline(begin, end, buffer_begin, buffer_begin + num_elements, extract_key); + } + + template + static bool sort_inline(It begin, It end, OutIt out_begin, OutIt out_end, ExtractKey && extract_key) + { + count_type counts0[256] = {}; + count_type counts1[256] = {}; + + for (It it = begin; it != end; ++it) + { + uint16_t key = to_unsigned_or_bool(extract_key(*it)); + ++counts0[key & 0xff]; + ++counts1[(key >> 8) & 0xff]; + } + count_type total0 = 0; + count_type total1 = 0; + for (int i = 0; i < 256; ++i) + { + count_type old_count0 = counts0[i]; + count_type old_count1 = counts1[i]; + counts0[i] = total0; + counts1[i] = total1; + total0 += old_count0; + total1 += old_count1; + } + for (It it = begin; it != end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)); + out_begin[counts0[key]++] = std::move(*it); + } + for (OutIt it = out_begin; it != out_end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)) >> 8; + begin[counts1[key]++] = std::move(*it); + } + return false; + } + + static constexpr size_t pass_count = 3; +}; +template<> +struct SizedRadixSorter<4> +{ + + template + static bool sort(It begin, It end, OutIt buffer_begin, ExtractKey && extract_key) + { + std::ptrdiff_t num_elements = end - begin; + if (num_elements <= (1ll << 32)) + return sort_inline(begin, end, buffer_begin, buffer_begin + num_elements, extract_key); + else + return sort_inline(begin, end, buffer_begin, buffer_begin + num_elements, extract_key); + } + template + static bool sort_inline(It begin, It end, OutIt out_begin, OutIt out_end, ExtractKey && extract_key) + { + count_type counts0[256] = {}; + count_type counts1[256] = {}; + count_type counts2[256] = {}; + count_type counts3[256] = {}; + + for (It it = begin; it != end; ++it) + { + uint32_t key = to_unsigned_or_bool(extract_key(*it)); + ++counts0[key & 0xff]; + ++counts1[(key >> 8) & 0xff]; + ++counts2[(key >> 16) & 0xff]; + ++counts3[(key >> 24) & 0xff]; + } + count_type total0 = 0; + count_type total1 = 0; + count_type total2 = 0; + count_type total3 = 0; + for (int i = 0; i < 256; ++i) + { + count_type old_count0 = counts0[i]; + count_type old_count1 = counts1[i]; + count_type old_count2 = counts2[i]; + count_type old_count3 = counts3[i]; + counts0[i] = total0; + counts1[i] = total1; + counts2[i] = total2; + counts3[i] = total3; + total0 += old_count0; + total1 += old_count1; + total2 += old_count2; + total3 += old_count3; + } + for (It it = begin; it != end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)); + out_begin[counts0[key]++] = std::move(*it); + } + for (OutIt it = out_begin; it != out_end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)) >> 8; + begin[counts1[key]++] = std::move(*it); + } + for (It it = begin; it != end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)) >> 16; + out_begin[counts2[key]++] = std::move(*it); + } + for (OutIt it = out_begin; it != out_end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)) >> 24; + begin[counts3[key]++] = std::move(*it); + } + return false; + } + + static constexpr size_t pass_count = 5; +}; +template<> +struct SizedRadixSorter<8> +{ + template + static bool sort(It begin, It end, OutIt buffer_begin, ExtractKey && extract_key) + { + std::ptrdiff_t num_elements = end - begin; + if (num_elements <= (1ll 
<< 32)) + return sort_inline(begin, end, buffer_begin, buffer_begin + num_elements, extract_key); + else + return sort_inline(begin, end, buffer_begin, buffer_begin + num_elements, extract_key); + } + template + static bool sort_inline(It begin, It end, OutIt out_begin, OutIt out_end, ExtractKey && extract_key) + { + count_type counts0[256] = {}; + count_type counts1[256] = {}; + count_type counts2[256] = {}; + count_type counts3[256] = {}; + count_type counts4[256] = {}; + count_type counts5[256] = {}; + count_type counts6[256] = {}; + count_type counts7[256] = {}; + + for (It it = begin; it != end; ++it) + { + uint64_t key = to_unsigned_or_bool(extract_key(*it)); + ++counts0[key & 0xff]; + ++counts1[(key >> 8) & 0xff]; + ++counts2[(key >> 16) & 0xff]; + ++counts3[(key >> 24) & 0xff]; + ++counts4[(key >> 32) & 0xff]; + ++counts5[(key >> 40) & 0xff]; + ++counts6[(key >> 48) & 0xff]; + ++counts7[(key >> 56) & 0xff]; + } + count_type total0 = 0; + count_type total1 = 0; + count_type total2 = 0; + count_type total3 = 0; + count_type total4 = 0; + count_type total5 = 0; + count_type total6 = 0; + count_type total7 = 0; + for (int i = 0; i < 256; ++i) + { + count_type old_count0 = counts0[i]; + count_type old_count1 = counts1[i]; + count_type old_count2 = counts2[i]; + count_type old_count3 = counts3[i]; + count_type old_count4 = counts4[i]; + count_type old_count5 = counts5[i]; + count_type old_count6 = counts6[i]; + count_type old_count7 = counts7[i]; + counts0[i] = total0; + counts1[i] = total1; + counts2[i] = total2; + counts3[i] = total3; + counts4[i] = total4; + counts5[i] = total5; + counts6[i] = total6; + counts7[i] = total7; + total0 += old_count0; + total1 += old_count1; + total2 += old_count2; + total3 += old_count3; + total4 += old_count4; + total5 += old_count5; + total6 += old_count6; + total7 += old_count7; + } + for (It it = begin; it != end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)); + out_begin[counts0[key]++] = std::move(*it); + } + for (OutIt it = out_begin; it != out_end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)) >> 8; + begin[counts1[key]++] = std::move(*it); + } + for (It it = begin; it != end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)) >> 16; + out_begin[counts2[key]++] = std::move(*it); + } + for (OutIt it = out_begin; it != out_end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)) >> 24; + begin[counts3[key]++] = std::move(*it); + } + for (It it = begin; it != end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)) >> 32; + out_begin[counts4[key]++] = std::move(*it); + } + for (OutIt it = out_begin; it != out_end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)) >> 40; + begin[counts5[key]++] = std::move(*it); + } + for (It it = begin; it != end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)) >> 48; + out_begin[counts6[key]++] = std::move(*it); + } + for (OutIt it = out_begin; it != out_end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)) >> 56; + begin[counts7[key]++] = std::move(*it); + } + return false; + } + + static constexpr size_t pass_count = 9; +}; + +template +struct RadixSorter; +template<> +struct RadixSorter +{ + template + static bool sort(It begin, It end, OutIt buffer_begin, ExtractKey && extract_key) + { + size_t false_count = 0; + for (It it = begin; it != end; ++it) + { + if (!extract_key(*it)) + ++false_count; + } + size_t true_position = false_count; + false_count = 0; + 
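+        // Second pass: a stable two-bucket distribution. Keys that are false
+        // are written to buffer_begin[0 .. false_count), keys that are true
+        // start at true_position, immediately after the last false element.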
for (; begin != end; ++begin) + { + if (extract_key(*begin)) + buffer_begin[true_position++] = std::move(*begin); + else + buffer_begin[false_count++] = std::move(*begin); + } + return true; + } + + static constexpr size_t pass_count = 2; +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template +struct RadixSorter > +{ + template + static bool sort(It begin, It end, OutIt buffer_begin, ExtractKey && extract_key) + { + bool first_result = RadixSorter::sort(begin, end, buffer_begin, [&](auto && o) + { + return extract_key(o).second; + }); + auto extract_first = [&](auto && o) + { + return extract_key(o).first; + }; + + if (first_result) + { + return !RadixSorter::sort(buffer_begin, buffer_begin + (end - begin), begin, extract_first); + } + else + { + return RadixSorter::sort(begin, end, buffer_begin, extract_first); + } + } + + static constexpr size_t pass_count = RadixSorter::pass_count + RadixSorter::pass_count; +}; +template +struct RadixSorter &> +{ + template + static bool sort(It begin, It end, OutIt buffer_begin, ExtractKey && extract_key) + { + bool first_result = RadixSorter::sort(begin, end, buffer_begin, [&](auto && o) -> const V & + { + return extract_key(o).second; + }); + auto extract_first = [&](auto && o) -> const K & + { + return extract_key(o).first; + }; + + if (first_result) + { + return !RadixSorter::sort(buffer_begin, buffer_begin + (end - begin), begin, extract_first); + } + else + { + return RadixSorter::sort(begin, end, buffer_begin, extract_first); + } + } + + static constexpr size_t pass_count = RadixSorter::pass_count + RadixSorter::pass_count; +}; +template +struct TupleRadixSorter +{ + using NextSorter = TupleRadixSorter; + using ThisSorter = RadixSorter::type>; + + template + static bool sort(It begin, It end, OutIt out_begin, OutIt out_end, ExtractKey && extract_key) + { + bool which = NextSorter::sort(begin, end, out_begin, out_end, extract_key); + auto extract_i = [&](auto && o) + { + return std::get(extract_key(o)); + }; + if (which) + return !ThisSorter::sort(out_begin, out_end, begin, extract_i); + else + return ThisSorter::sort(begin, end, out_begin, extract_i); + } + + static constexpr size_t pass_count = ThisSorter::pass_count + NextSorter::pass_count; +}; +template +struct TupleRadixSorter +{ + using NextSorter = TupleRadixSorter; + using ThisSorter = RadixSorter::type>; + + template + static bool sort(It begin, It end, OutIt out_begin, OutIt out_end, ExtractKey && extract_key) + { + bool which = NextSorter::sort(begin, end, out_begin, out_end, extract_key); + auto extract_i = [&](auto && o) -> decltype(auto) + { + return std::get(extract_key(o)); + }; + if (which) + return 
!ThisSorter::sort(out_begin, out_end, begin, extract_i); + else + return ThisSorter::sort(begin, end, out_begin, extract_i); + } + + static constexpr size_t pass_count = ThisSorter::pass_count + NextSorter::pass_count; +}; +template +struct TupleRadixSorter +{ + template + static bool sort(It, It, OutIt, OutIt, ExtractKey &&) + { + return false; + } + + static constexpr size_t pass_count = 0; +}; +template +struct TupleRadixSorter +{ + template + static bool sort(It, It, OutIt, OutIt, ExtractKey &&) + { + return false; + } + + static constexpr size_t pass_count = 0; +}; + +template +struct RadixSorter > +{ + using SorterImpl = TupleRadixSorter<0, sizeof...(Args), std::tuple >; + + template + static bool sort(It begin, It end, OutIt buffer_begin, ExtractKey && extract_key) + { + return SorterImpl::sort(begin, end, buffer_begin, buffer_begin + (end - begin), extract_key); + } + + static constexpr size_t pass_count = SorterImpl::pass_count; +}; + +template +struct RadixSorter &> +{ + using SorterImpl = TupleRadixSorter<0, sizeof...(Args), const std::tuple &>; + + template + static bool sort(It begin, It end, OutIt buffer_begin, ExtractKey && extract_key) + { + return SorterImpl::sort(begin, end, buffer_begin, buffer_begin + (end - begin), extract_key); + } + + static constexpr size_t pass_count = SorterImpl::pass_count; +}; + +template +struct RadixSorter > +{ + template + static bool sort(It begin, It end, OutIt buffer_begin, ExtractKey && extract_key) + { + auto buffer_end = buffer_begin + (end - begin); + bool which = false; + for (size_t i = S; i > 0; --i) + { + auto extract_i = [&, i = i - 1](auto && o) + { + return extract_key(o)[i]; + }; + if (which) + which = !RadixSorter::sort(buffer_begin, buffer_end, begin, extract_i); + else + which = RadixSorter::sort(begin, end, buffer_begin, extract_i); + } + return which; + } + + static constexpr size_t pass_count = RadixSorter::pass_count * S; +}; + +template +struct RadixSorter : RadixSorter +{ +}; +template +struct RadixSorter : RadixSorter +{ +}; +template +struct RadixSorter : RadixSorter +{ +}; +template +struct RadixSorter : RadixSorter +{ +}; +template +struct RadixSorter : RadixSorter +{ +}; +// these structs serve two purposes +// 1. they serve as illustration for how to implement the to_radix_sort_key function +// 2. they help produce better error messages. 
with these overloads you get the +// error message "no matching function for call to to_radix_sort(your_type)" +// without these examples, you'd get the error message "to_radix_sort_key was +// not declared in this scope" which is a much less useful error message +struct ExampleStructA { int i; }; +struct ExampleStructB { float f; }; +inline int to_radix_sort_key(ExampleStructA a) { return a.i; } +inline float to_radix_sort_key(ExampleStructB b) { return b.f; } +template +struct FallbackRadixSorter : RadixSorter()))> +{ + using base = RadixSorter()))>; + + template + static bool sort(It begin, It end, OutIt buffer_begin, ExtractKey && extract_key) + { + return base::sort(begin, end, buffer_begin, [&](auto && a) -> decltype(auto) + { + return to_radix_sort_key(extract_key(a)); + }); + } +}; + +template +struct nested_void +{ + using type = void; +}; + +template +using void_t = typename nested_void::type; + +template +struct has_subscript_operator_impl +{ + template()[0])> + static std::true_type test(int); + template + static std::false_type test(...); + + using type = decltype(test(0)); +}; + +template +using has_subscript_operator = typename has_subscript_operator_impl::type; + + +template +struct FallbackRadixSorter()))> > + : RadixSorter()))> +{ +}; + +template +struct RadixSorter : FallbackRadixSorter +{ +}; + +template +size_t radix_sort_pass_count = RadixSorter::pass_count; + +template +inline void unroll_loop_four_times(It begin, size_t iteration_count, Func && to_call) +{ + size_t loop_count = iteration_count / 4; + size_t remainder_count = iteration_count - loop_count * 4; + for (; loop_count > 0; --loop_count) + { + to_call(begin); + ++begin; + to_call(begin); + ++begin; + to_call(begin); + ++begin; + to_call(begin); + ++begin; + } + switch(remainder_count) + { + case 3: + to_call(begin); + ++begin; + case 2: + to_call(begin); + ++begin; + case 1: + to_call(begin); + } +} + +template +inline It custom_std_partition(It begin, It end, F && func) +{ + for (;; ++begin) + { + if (begin == end) + return end; + if (!func(*begin)) + break; + } + It it = begin; + for(++it; it != end; ++it) + { + if (!func(*it)) + continue; + + std::iter_swap(begin, it); + ++begin; + } + return begin; +} + +struct PartitionInfo +{ + PartitionInfo() + : count(0) + { + } + + union + { + size_t count; + size_t offset; + }; + size_t next_offset; +}; + +template +struct UnsignedForSize; +template<> +struct UnsignedForSize<1> +{ + typedef uint8_t type; +}; +template<> +struct UnsignedForSize<2> +{ + typedef uint16_t type; +}; +template<> +struct UnsignedForSize<4> +{ + typedef uint32_t type; +}; +template<> +struct UnsignedForSize<8> +{ + typedef uint64_t type; +}; +template +struct SubKey; +template +struct SizedSubKey +{ + template + static auto sub_key(T && value, void *) + { + return to_unsigned_or_bool(value); + } + + typedef SubKey next; + + using sub_key_type = typename UnsignedForSize::type; +}; +template +struct SubKey : SubKey +{ +}; +template +struct SubKey : SubKey +{ +}; +template +struct SubKey : SubKey +{ +}; +template +struct SubKey : SubKey +{ +}; +template +struct SubKey : SubKey +{ +}; +template +struct FallbackSubKey + : SubKey()))> +{ + using base = SubKey()))>; + + template + static decltype(auto) sub_key(U && value, void * data) + { + return base::sub_key(to_radix_sort_key(value), data); + } +}; +template +struct FallbackSubKey()))> > + : SubKey()))> +{ +}; +template +struct SubKey : FallbackSubKey +{ +}; +template<> +struct SubKey +{ + template + static bool sub_key(T && value, void *) + 
{ + return value; + } + + typedef SubKey next; + + using sub_key_type = bool; +}; +template<> +struct SubKey; +template<> +struct SubKey : SizedSubKey +{ +}; +template<> +struct SubKey : SizedSubKey +{ +}; +template<> +struct SubKey : SizedSubKey +{ +}; +template<> +struct SubKey : SizedSubKey +{ +}; +template<> +struct SubKey : SizedSubKey +{ +}; +template +struct SubKey : SizedSubKey +{ +}; +template +struct PairSecondSubKey : Current +{ + static decltype(auto) sub_key(const std::pair & value, void * sort_data) + { + return Current::sub_key(value.second, sort_data); + } + + using next = typename std::conditional, typename Current::next>::value, SubKey, PairSecondSubKey >::type; +}; +template +struct PairFirstSubKey : Current +{ + static decltype(auto) sub_key(const std::pair & value, void * sort_data) + { + return Current::sub_key(value.first, sort_data); + } + + using next = typename std::conditional, typename Current::next>::value, PairSecondSubKey >, PairFirstSubKey >::type; +}; +template +struct SubKey > : PairFirstSubKey > +{ +}; +template +struct TypeAt : TypeAt +{ +}; +template +struct TypeAt<0, First, More...> +{ + typedef First type; +}; + +template +struct TupleSubKey; + +template +struct NextTupleSubKey +{ + using type = TupleSubKey; +}; +template +struct NextTupleSubKey, First, Second, More...> +{ + using type = TupleSubKey, Second, More...>; +}; +template +struct NextTupleSubKey, First> +{ + using type = SubKey; +}; + +template +struct TupleSubKey : Current +{ + template + static decltype(auto) sub_key(const Tuple & value, void * sort_data) + { + return Current::sub_key(std::get(value), sort_data); + } + + using next = typename NextTupleSubKey::type; +}; +template +struct TupleSubKey : Current +{ + template + static decltype(auto) sub_key(const Tuple & value, void * sort_data) + { + return Current::sub_key(std::get(value), sort_data); + } + + using next = typename NextTupleSubKey::type; +}; +template +struct SubKey > : TupleSubKey<0, SubKey, First, More...> +{ +}; + +struct BaseListSortData +{ + size_t current_index; + size_t recursion_limit; + void * next_sort_data; +}; +template +struct ListSortData : BaseListSortData +{ + void (*next_sort)(It, It, std::ptrdiff_t, ExtractKey &, void *); +}; + +template +struct ListElementSubKey : SubKey()[0])>::type> +{ + using base = SubKey()[0])>::type>; + + using next = ListElementSubKey; + + template + static decltype(auto) sub_key(U && value, void * sort_data) + { + BaseListSortData * list_sort_data = static_cast(sort_data); + const T & list = CurrentSubKey::sub_key(value, list_sort_data->next_sort_data); + return base::sub_key(list[list_sort_data->current_index], list_sort_data->next_sort_data); + } +}; + +template +struct ListSubKey +{ + using next = SubKey; + + using sub_key_type = T; + + static const T & sub_key(const T & value, void *) + { + return value; + } +}; + +template +struct FallbackSubKey::value>::type> : ListSubKey +{ +}; + +template +inline void StdSortFallback(It begin, It end, ExtractKey & extract_key) +{ + std::sort(begin, end, [&](auto && l, auto && r){ return extract_key(l) < extract_key(r); }); +} + +template +inline bool StdSortIfLessThanThreshold(It begin, It end, std::ptrdiff_t num_elements, ExtractKey & extract_key) +{ + if (num_elements <= 1) + return true; + if (num_elements >= StdSortThreshold) + return false; + StdSortFallback(begin, end, extract_key); + return true; +} + +template +struct InplaceSorter; + +template +struct UnsignedInplaceSorter +{ + static constexpr size_t ShiftAmount = (((NumBytes - 1) 
- Offset) * 8); + template + inline static uint8_t current_byte(T && elem, void * sort_data) + { + return CurrentSubKey::sub_key(elem, sort_data) >> ShiftAmount; + } + template + static void sort(It begin, It end, std::ptrdiff_t num_elements, ExtractKey & extract_key, void (*next_sort)(It, It, std::ptrdiff_t, ExtractKey &, void *), void * sort_data) + { + if (num_elements < AmericanFlagSortThreshold) + american_flag_sort(begin, end, extract_key, next_sort, sort_data); + else + ska_byte_sort(begin, end, extract_key, next_sort, sort_data); + } + + template + static void american_flag_sort(It begin, It end, ExtractKey & extract_key, void (*next_sort)(It, It, std::ptrdiff_t, ExtractKey &, void *), void * sort_data) + { + PartitionInfo partitions[256]; + for (It it = begin; it != end; ++it) + { + ++partitions[current_byte(extract_key(*it), sort_data)].count; + } + size_t total = 0; + uint8_t remaining_partitions[256]; + int num_partitions = 0; + for (int i = 0; i < 256; ++i) + { + size_t count = partitions[i].count; + if (!count) + continue; + partitions[i].offset = total; + total += count; + partitions[i].next_offset = total; + remaining_partitions[num_partitions] = i; + ++num_partitions; + } + if (num_partitions > 1) + { + uint8_t * current_block_ptr = remaining_partitions; + PartitionInfo * current_block = partitions + *current_block_ptr; + uint8_t * last_block = remaining_partitions + num_partitions - 1; + It it = begin; + It block_end = begin + current_block->next_offset; + It last_element = end - 1; + for (;;) + { + PartitionInfo * block = partitions + current_byte(extract_key(*it), sort_data); + if (block == current_block) + { + ++it; + if (it == last_element) + break; + else if (it == block_end) + { + for (;;) + { + ++current_block_ptr; + if (current_block_ptr == last_block) + goto recurse; + current_block = partitions + *current_block_ptr; + if (current_block->offset != current_block->next_offset) + break; + } + + it = begin + current_block->offset; + block_end = begin + current_block->next_offset; + } + } + else + { + size_t offset = block->offset++; + std::iter_swap(it, begin + offset); + } + } + } + recurse: + if (Offset + 1 != NumBytes || next_sort) + { + size_t start_offset = 0; + It partition_begin = begin; + for (uint8_t * it = remaining_partitions, * end = remaining_partitions + num_partitions; it != end; ++it) + { + size_t end_offset = partitions[*it].next_offset; + It partition_end = begin + end_offset; + std::ptrdiff_t num_elements = end_offset - start_offset; + if (!StdSortIfLessThanThreshold(partition_begin, partition_end, num_elements, extract_key)) + { + UnsignedInplaceSorter::sort(partition_begin, partition_end, num_elements, extract_key, next_sort, sort_data); + } + start_offset = end_offset; + partition_begin = partition_end; + } + } + } + + template + static void ska_byte_sort(It begin, It end, ExtractKey & extract_key, void (*next_sort)(It, It, std::ptrdiff_t, ExtractKey &, void *), void * sort_data) + { + PartitionInfo partitions[256]; + for (It it = begin; it != end; ++it) + { + ++partitions[current_byte(extract_key(*it), sort_data)].count; + } + uint8_t remaining_partitions[256]; + size_t total = 0; + int num_partitions = 0; + for (int i = 0; i < 256; ++i) + { + size_t count = partitions[i].count; + if (count) + { + partitions[i].offset = total; + total += count; + remaining_partitions[num_partitions] = i; + ++num_partitions; + } + partitions[i].next_offset = total; + } + for (uint8_t * last_remaining = remaining_partitions + num_partitions, * end_partition = 
remaining_partitions + 1; last_remaining > end_partition;) + { + last_remaining = custom_std_partition(remaining_partitions, last_remaining, [&](uint8_t partition) + { + size_t & begin_offset = partitions[partition].offset; + size_t & end_offset = partitions[partition].next_offset; + if (begin_offset == end_offset) + return false; + + unroll_loop_four_times(begin + begin_offset, end_offset - begin_offset, [partitions = partitions, begin, &extract_key, sort_data](It it) + { + uint8_t this_partition = current_byte(extract_key(*it), sort_data); + size_t offset = partitions[this_partition].offset++; + std::iter_swap(it, begin + offset); + }); + return begin_offset != end_offset; + }); + } + if (Offset + 1 != NumBytes || next_sort) + { + for (uint8_t * it = remaining_partitions + num_partitions; it != remaining_partitions; --it) + { + uint8_t partition = it[-1]; + size_t start_offset = (partition == 0 ? 0 : partitions[partition - 1].next_offset); + size_t end_offset = partitions[partition].next_offset; + It partition_begin = begin + start_offset; + It partition_end = begin + end_offset; + std::ptrdiff_t num_elements = end_offset - start_offset; + if (!StdSortIfLessThanThreshold(partition_begin, partition_end, num_elements, extract_key)) + { + UnsignedInplaceSorter::sort(partition_begin, partition_end, num_elements, extract_key, next_sort, sort_data); + } + } + } + } +}; + +template +struct UnsignedInplaceSorter +{ + template + inline static void sort(It begin, It end, std::ptrdiff_t num_elements, ExtractKey & extract_key, void (*next_sort)(It, It, std::ptrdiff_t, ExtractKey &, void *), void * next_sort_data) + { + next_sort(begin, end, num_elements, extract_key, next_sort_data); + } +}; + +template +size_t CommonPrefix(It begin, It end, size_t start_index, ExtractKey && extract_key, ElementKey && element_key) +{ + const auto & largest_match_list = extract_key(*begin); + size_t largest_match = largest_match_list.size(); + if (largest_match == start_index) + return start_index; + for (++begin; begin != end; ++begin) + { + const auto & current_list = extract_key(*begin); + size_t current_size = current_list.size(); + if (current_size < largest_match) + { + largest_match = current_size; + if (largest_match == start_index) + return start_index; + } + if (element_key(largest_match_list[start_index]) != element_key(current_list[start_index])) + return start_index; + for (size_t i = start_index + 1; i < largest_match; ++i) + { + if (element_key(largest_match_list[i]) != element_key(current_list[i])) + { + largest_match = i; + break; + } + } + } + return largest_match; +} + +template +struct ListInplaceSorter +{ + using ElementSubKey = ListElementSubKey; + template + static void sort(It begin, It end, ExtractKey & extract_key, ListSortData * sort_data) + { + size_t current_index = sort_data->current_index; + void * next_sort_data = sort_data->next_sort_data; + auto current_key = [&](auto && elem) -> decltype(auto) + { + return CurrentSubKey::sub_key(extract_key(elem), next_sort_data); + }; + auto element_key = [&](auto && elem) -> decltype(auto) + { + return ElementSubKey::base::sub_key(elem, sort_data); + }; + sort_data->current_index = current_index = CommonPrefix(begin, end, current_index, current_key, element_key); + It end_of_shorter_ones = std::partition(begin, end, [&](auto && elem) + { + return current_key(elem).size() <= current_index; + }); + std::ptrdiff_t num_shorter_ones = end_of_shorter_ones - begin; + if (sort_data->next_sort && !StdSortIfLessThanThreshold(begin, end_of_shorter_ones, 
num_shorter_ones, extract_key)) + { + sort_data->next_sort(begin, end_of_shorter_ones, num_shorter_ones, extract_key, next_sort_data); + } + std::ptrdiff_t num_elements = end - end_of_shorter_ones; + if (!StdSortIfLessThanThreshold(end_of_shorter_ones, end, num_elements, extract_key)) + { + void (*sort_next_element)(It, It, std::ptrdiff_t, ExtractKey &, void *) = static_cast(&sort_from_recursion); + InplaceSorter::sort(end_of_shorter_ones, end, num_elements, extract_key, sort_next_element, sort_data); + } + } + + template + static void sort_from_recursion(It begin, It end, std::ptrdiff_t, ExtractKey & extract_key, void * next_sort_data) + { + ListSortData offset = *static_cast *>(next_sort_data); + ++offset.current_index; + --offset.recursion_limit; + if (offset.recursion_limit == 0) + { + StdSortFallback(begin, end, extract_key); + } + else + { + sort(begin, end, extract_key, &offset); + } + } + + + template + static void sort(It begin, It end, std::ptrdiff_t, ExtractKey & extract_key, void (*next_sort)(It, It, std::ptrdiff_t, ExtractKey &, void *), void * next_sort_data) + { + ListSortData offset; + offset.current_index = 0; + offset.recursion_limit = 16; + offset.next_sort = next_sort; + offset.next_sort_data = next_sort_data; + sort(begin, end, extract_key, &offset); + } +}; + +template +struct InplaceSorter +{ + template + static void sort(It begin, It end, std::ptrdiff_t, ExtractKey & extract_key, void (*next_sort)(It, It, std::ptrdiff_t, ExtractKey &, void *), void * sort_data) + { + It middle = std::partition(begin, end, [&](auto && a){ return !CurrentSubKey::sub_key(extract_key(a), sort_data); }); + if (next_sort) + { + next_sort(begin, middle, middle - begin, extract_key, sort_data); + next_sort(middle, end, end - middle, extract_key, sort_data); + } + } +}; + +template +struct InplaceSorter : UnsignedInplaceSorter +{ +}; +template +struct InplaceSorter : UnsignedInplaceSorter +{ +}; +template +struct InplaceSorter : UnsignedInplaceSorter +{ +}; +template +struct InplaceSorter : UnsignedInplaceSorter +{ +}; +template +struct FallbackInplaceSorter; + +template +struct InplaceSorter : FallbackInplaceSorter +{ +}; + +template +struct FallbackInplaceSorter::value>::type> + : ListInplaceSorter +{ +}; + +template +struct SortStarter; +template +struct SortStarter > +{ + template + static void sort(It, It, std::ptrdiff_t, ExtractKey &, void *) + { + } +}; + +template +struct SortStarter +{ + template + static void sort(It begin, It end, std::ptrdiff_t num_elements, ExtractKey & extract_key, void * next_sort_data = nullptr) + { + if (StdSortIfLessThanThreshold(begin, end, num_elements, extract_key)) + return; + + void (*next_sort)(It, It, std::ptrdiff_t, ExtractKey &, void *) = static_cast(&SortStarter::sort); + if (next_sort == static_cast(&SortStarter >::sort)) + next_sort = nullptr; + InplaceSorter::sort(begin, end, num_elements, extract_key, next_sort, next_sort_data); + } +}; + +template +void inplace_radix_sort(It begin, It end, ExtractKey & extract_key) +{ + using SubKey = SubKey; + SortStarter::sort(begin, end, end - begin, extract_key); +} + +struct IdentityFunctor +{ + template + decltype(auto) operator()(T && i) const + { + return std::forward(i); + } +}; +} + +template +static void ska_sort(It begin, It end, ExtractKey && extract_key) +{ + detail::inplace_radix_sort<128, 1024>(begin, end, extract_key); +} + +template +static void ska_sort(It begin, It end) +{ + ska_sort(begin, end, detail::IdentityFunctor()); +} + +template +bool ska_sort_copy(It begin, It end, OutIt 
buffer_begin, ExtractKey && key) +{ + std::ptrdiff_t num_elements = end - begin; + if (num_elements < 128 || detail::radix_sort_pass_count::type> >= 8) + { + ska_sort(begin, end, key); + return false; + } + else + return detail::RadixSorter::type>::sort(begin, end, buffer_begin, key); +} +template +bool ska_sort_copy(It begin, It end, OutIt buffer_begin) +{ + return ska_sort_copy(begin, end, buffer_begin, detail::IdentityFunctor()); +} + +#endif /* __cplusplus */ +COSMOPOLITAN_C_START_ + +void ska_sort_int64(int64_t *, size_t); + +COSMOPOLITAN_C_END_ +#endif /* COSMOPOLITAN_LIBC_MEM_SKA_SORT_H_ */ diff --git a/third_party/libcxx/utils.h b/third_party/libcxx/utils.h new file mode 100644 index 000000000..a3d93c9f7 --- /dev/null +++ b/third_party/libcxx/utils.h @@ -0,0 +1,42 @@ +// -*- c++ -*- +// clang-format off +#ifndef COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_UTILS_H_ +#define COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_UTILS_H_ +#include "third_party/libcxx/iterator" + +namespace learned_sort { +namespace utils { + +// Helper functions for working with unsigned types +template +inline T _get_key(T a) { + return a; +} + +template +void insertion_sort(RandomIt begin, RandomIt end) { + // Determine the data type + typedef typename std::iterator_traits::value_type T; + + // Determine the input size + const size_t input_sz = std::distance(begin, end); + + if (input_sz <= 0) return; + + RandomIt cmp_idx; + T key; + for (auto i = begin + 1; i != end; ++i) { + key = i[0]; + cmp_idx = i - 1; + while (cmp_idx >= begin && cmp_idx[0] > key) { + cmp_idx[1] = cmp_idx[0]; + --cmp_idx; + } + cmp_idx[1] = key; + } +} + +} // namespace utils +} // namespace learned_sort + +#endif /* COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_UTILS_H_ */
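For orientation, here is a minimal usage sketch of the interfaces introduced above. The template entry points (pdqsort, ska_sort) come directly from the headers in this change; the flat C wrappers additionally assume their *.cc translation units are linked in, and the sample data and return-code convention are illustrative only:

#include "third_party/libcxx/algorithm"
#include "third_party/libcxx/pdqsort.h"
#include "third_party/libcxx/ska_sort.h"

int main() {
  int64_t a[] = {9, -3, 7, 7, 0, -128, 42};
  int64_t b[] = {9, -3, 7, 7, 0, -128, 42};
  size_t n = sizeof(a) / sizeof(a[0]);

  // Header-only template interfaces.
  pdqsort(a, a + n);   // std::less comparison; branchless partitioning for arithmetic types
  ska_sort(b, b + n);  // radix entry point; inputs this small fall back to std::sort internally

  // Flat C entry points declared in the same headers.
  pdqsort_branchless_int64(a, n);  // already sorted, so this is nearly a no-op
  ska_sort_int64(b, n);

  // Every strategy must produce the same order.
  return std::equal(a, a + n, b) ? 0 : 1;
}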