diff --git a/test/libc/str/longsort_test.c b/test/libc/str/longsort_test.c index cf63f780d..ae672711f 100644 --- a/test/libc/str/longsort_test.c +++ b/test/libc/str/longsort_test.c @@ -16,6 +16,7 @@ │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ │ PERFORMANCE OF THIS SOFTWARE. │ ╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/macros.internal.h" #include "libc/mem/alg.h" #include "libc/mem/gc.internal.h" #include "libc/mem/mem.h" @@ -25,8 +26,24 @@ #include "libc/str/str.h" #include "libc/testlib/ezbench.h" #include "libc/testlib/testlib.h" +#include "third_party/libcxx/learned_sort.h" +#include "third_party/libcxx/pdqsort.h" +#include "third_party/libcxx/ska_sort.h" #include "third_party/vqsort/vqsort.h" +void InsertionSort(long *A, long n) { + long i, j, t; + for (i = 1; i < n; i++) { + t = A[i]; + j = i - 1; + while (j >= 0 && A[j] > t) { + A[j + 1] = A[j]; + j = j - 1; + } + A[j + 1] = t; + } +} + int CompareLong(const void *a, const void *b) { const long *x = a; const long *y = b; @@ -110,25 +127,87 @@ BENCH(_longsort, bench) { long *p1 = gc(malloc(n * sizeof(long))); long *p2 = gc(malloc(n * sizeof(long))); rngset(p1, n * sizeof(long), 0, 0); - EZBENCH2("_longsort", memcpy(p2, p1, n * sizeof(long)), _longsort(p2, n)); + EZBENCH2("_longsort rand", memcpy(p2, p1, n * sizeof(long)), + _longsort(p2, n)); if (X86_HAVE(AVX2)) { - EZBENCH2("vqsort_int64_avx2", memcpy(p2, p1, n * sizeof(long)), + EZBENCH2("vq_int64_avx2 rand", memcpy(p2, p1, n * sizeof(long)), vqsort_int64_avx2(p2, n)); } if (X86_HAVE(SSE4_2)) { - EZBENCH2("vqsort_int64_sse4", memcpy(p2, p1, n * sizeof(long)), + EZBENCH2("vq_int64_sse4 rand", memcpy(p2, p1, n * sizeof(long)), vqsort_int64_sse4(p2, n)); } if (X86_HAVE(SSSE3)) { - EZBENCH2("vqsort_int64_ssse3", memcpy(p2, p1, n * sizeof(long)), + EZBENCH2("vq_int64_ssse3 rand", memcpy(p2, p1, n * sizeof(long)), vqsort_int64_ssse3(p2, n)); } - EZBENCH2("vqsort_int64_sse2", memcpy(p2, p1, n * sizeof(long)), + EZBENCH2("vq_int64_sse2 rand", memcpy(p2, p1, n * sizeof(long)), vqsort_int64_sse2(p2, n)); - EZBENCH2("radix_sort_int64", memcpy(p2, p1, n * sizeof(long)), + EZBENCH2("radix_int64 rand", memcpy(p2, p1, n * sizeof(long)), radix_sort_int64(p2, n)); - EZBENCH2("qsort(long)", memcpy(p2, p1, n * sizeof(long)), + EZBENCH2("ska_sort_int64 rand", memcpy(p2, p1, n * sizeof(long)), + ska_sort_int64(p2, n)); + EZBENCH2("pdq_int64 rand", memcpy(p2, p1, n * sizeof(long)), + pdqsort_int64(p2, n)); + EZBENCH2("pdq_branchless rand", memcpy(p2, p1, n * sizeof(long)), + pdqsort_branchless_int64(p2, n)); + EZBENCH2("learned_int64 rand", memcpy(p2, p1, n * sizeof(long)), + learned_sort_int64(p2, n)); + EZBENCH2("qsort(long) rand", memcpy(p2, p1, n * sizeof(long)), qsort(p2, n, sizeof(long), CompareLong)); + EZBENCH2("InsertionSort rand", memcpy(p2, p1, n * sizeof(long)), + InsertionSort(p2, n)); +} + +BENCH(_longsort, benchNearlySorted) { + printf("\n"); + long i, j, k, t; + long n = 5000; // total items + long m = 500; // misplaced items + long d = 5; // maximum drift + long *p1 = gc(malloc(n * sizeof(long))); + long *p2 = gc(malloc(n * sizeof(long))); + for (i = 0; i < n; ++i) { + p1[i] = i; + } + for (i = 0; i < m; ++i) { + j = rand() % n; + k = rand() % (d * 2) - d + j; + k = MIN(MAX(0, k), n - 1); + t = p1[j]; + p1[j] = p1[k]; + p1[k] = t; + } + EZBENCH2("_longsort near", memcpy(p2, p1, n * sizeof(long)), + _longsort(p2, n)); + if (X86_HAVE(AVX2)) { + EZBENCH2("vq_int64_avx2 near", memcpy(p2, p1, n * sizeof(long)), + 
vqsort_int64_avx2(p2, n)); + } + if (X86_HAVE(SSE4_2)) { + EZBENCH2("vq_int64_sse4 near", memcpy(p2, p1, n * sizeof(long)), + vqsort_int64_sse4(p2, n)); + } + if (X86_HAVE(SSSE3)) { + EZBENCH2("vq_int64_ssse3 near", memcpy(p2, p1, n * sizeof(long)), + vqsort_int64_ssse3(p2, n)); + } + EZBENCH2("vq_int64_sse2 near", memcpy(p2, p1, n * sizeof(long)), + vqsort_int64_sse2(p2, n)); + EZBENCH2("radix_int64 near", memcpy(p2, p1, n * sizeof(long)), + radix_sort_int64(p2, n)); + EZBENCH2("ska_sort_int64 near", memcpy(p2, p1, n * sizeof(long)), + ska_sort_int64(p2, n)); + EZBENCH2("pdq_int64 near", memcpy(p2, p1, n * sizeof(long)), + pdqsort_int64(p2, n)); + EZBENCH2("pdq_branchless near", memcpy(p2, p1, n * sizeof(long)), + pdqsort_branchless_int64(p2, n)); + EZBENCH2("learned_int64 near", memcpy(p2, p1, n * sizeof(long)), + learned_sort_int64(p2, n)); + EZBENCH2("qsort(long) near", memcpy(p2, p1, n * sizeof(long)), + qsort(p2, n, sizeof(long), CompareLong)); + EZBENCH2("InsertionSort near", memcpy(p2, p1, n * sizeof(long)), + InsertionSort(p2, n)); } int CompareInt(const void *a, const void *b) { @@ -215,6 +294,7 @@ BENCH(_intsort, bench) { int *p2 = gc(malloc(n * sizeof(int))); rngset(p1, n * sizeof(int), 0, 0); EZBENCH2("_intsort", memcpy(p2, p1, n * sizeof(int)), _intsort(p2, n)); + EZBENCH2("djbsort", memcpy(p2, p1, n * sizeof(int)), djbsort(p2, n)); if (X86_HAVE(AVX2)) { EZBENCH2("vqsort_int32_avx2", memcpy(p2, p1, n * sizeof(int)), vqsort_int32_avx2(p2, n)); @@ -229,7 +309,6 @@ BENCH(_intsort, bench) { } EZBENCH2("vqsort_int32_sse2", memcpy(p2, p1, n * sizeof(int)), vqsort_int32_sse2(p2, n)); - EZBENCH2("djbsort", memcpy(p2, p1, n * sizeof(int)), djbsort(p2, n)); EZBENCH2("radix_sort_int32", memcpy(p2, p1, n * sizeof(int)), radix_sort_int32(p2, n)); EZBENCH2("qsort(int)", memcpy(p2, p1, n * sizeof(int)), diff --git a/third_party/ggml/llama_util.h b/third_party/ggml/llama_util.h index 05184945d..2f7dbe0b2 100755 --- a/third_party/ggml/llama_util.h +++ b/third_party/ggml/llama_util.h @@ -1,6 +1,6 @@ +// -*- c++ -*- // Internal header to be included only by llama.cpp. // Contains wrappers around OS interfaces. - #ifndef LLAMA_UTIL_H #define LLAMA_UTIL_H #include "libc/calls/struct/rlimit.h" diff --git a/third_party/libcxx/learned_sort.cc b/third_party/libcxx/learned_sort.cc new file mode 100644 index 000000000..dbe198f8a --- /dev/null +++ b/third_party/libcxx/learned_sort.cc @@ -0,0 +1,25 @@ +/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│ +│vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2023 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. 
│ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "third_party/libcxx/learned_sort.h" + +void learned_sort_int64(int64_t* A, size_t n) { + std::vector v(A, A + n); + learned_sort::sort(v.begin(), v.end()); + std::copy(v.begin(), v.end(), A); +} diff --git a/third_party/libcxx/learned_sort.h b/third_party/libcxx/learned_sort.h new file mode 100644 index 000000000..0c3a28e73 --- /dev/null +++ b/third_party/libcxx/learned_sort.h @@ -0,0 +1,1100 @@ +// -*- c++ -*- +#ifndef COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_H_ +#define COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_H_ +#ifdef __cplusplus +#include "third_party/libcxx/algorithm" +#include "third_party/libcxx/cmath" +#include "third_party/libcxx/iterator" +#include "third_party/libcxx/vector" +#include "third_party/libcxx/rmi.h" +#include "third_party/libcxx/utils.h" +// clang-format off + +/** + * @file learned_sort.h + * @author Ani Kristo, Kapil Vaidya + * @brief The purpose of this file is to provide an implementation + * of Learned Sort, a model-enhanced sorting algorithm. + * + * @copyright Copyright (c) 2021 Ani Kristo + * @copyright Copyright (C) 2021 Kapil Vaidya + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 3 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +using namespace std; +using namespace learned_sort; + +namespace learned_sort { + +// Parameters +static constexpr int PRIMARY_FANOUT = 1000; +static constexpr int SECONDARY_FANOUT = 100; +static constexpr int PRIMARY_FRAGMENT_CAPACITY = 100; +static constexpr int SECONDARY_FRAGMENT_CAPACITY = 100; +static constexpr int REP_CNT_THRESHOLD = 5; + +// static struct timespec SubtractTime(struct timespec a, struct timespec b) { +// a.tv_sec -= b.tv_sec; +// if (a.tv_nsec < b.tv_nsec) { +// a.tv_nsec += 1000000000; +// a.tv_sec--; +// } +// a.tv_nsec -= b.tv_nsec; +// return a; +// } + +// static time_t ToNanoseconds(struct timespec ts) { +// return ts.tv_sec * 1000000000 + ts.tv_nsec; +// } + +template +void sort(RandomIt begin, RandomIt end, + TwoLayerRMI::value_type> &rmi) { + //----------------------------------------------------------// + // INIT // + //----------------------------------------------------------// + + // Determine the data type + typedef typename iterator_traits::value_type T; + + // Constants + const long input_sz = std::distance(begin, end); + const long TRAINING_SAMPLE_SZ = rmi.training_sample.size(); + + // Keeps track of the number of elements in each bucket + long primary_bucket_sizes[PRIMARY_FANOUT]{0}; + + // Counts the number of elements that are done going through the + // partitioning steps for good + long num_elms_finalized = 0; + + // Cache the model parameters + const long num_leaf_models = rmi.hp.num_leaf_models; + double root_slope = rmi.root_model.slope; + double root_intercept = rmi.root_model.intercept; + auto num_models = rmi.hp.num_leaf_models; + double slopes[num_leaf_models]; + double intercepts[num_leaf_models]; + for (auto i = 0; i < num_leaf_models; ++i) { + slopes[i] = rmi.leaf_models[i].slope; + intercepts[i] = 
rmi.leaf_models[i].intercept; + } + + // struct timespec ts1, ts2; + + //----------------------------------------------------------// + // PARTITION THE KEYS INTO BUCKETS // + //----------------------------------------------------------// + + // clock_gettime(CLOCK_REALTIME, &ts1); + { + // Keeps track of the number of elements in each fragment + long fragment_sizes[PRIMARY_FANOUT]{0}; + + // An auxiliary set of fragments where the elements will be partitioned + auto fragments = new T[PRIMARY_FANOUT][PRIMARY_FRAGMENT_CAPACITY]; + + // Keeps track of the number of fragments that have been written back to the + // original array + long fragments_written = 0; + + // Points to the next free space where to write back + auto write_itr = begin; + + // For each element in the input, predict which bucket it would go to, and + // insert to the respective bucket fragment + for (auto it = begin; it != end; ++it) { + // Predict the model id in the leaf layer of the RMI + long pred_bucket_idx = static_cast(std::max( + 0., + std::min(num_leaf_models - 1., root_slope * it[0] + root_intercept))); + + // Predict the CDF + double pred_cdf = + slopes[pred_bucket_idx] * it[0] + intercepts[pred_bucket_idx]; + + // Get the predicted bucket id + pred_bucket_idx = static_cast(std::max( + 0., std::min(PRIMARY_FANOUT - 1., pred_cdf * PRIMARY_FANOUT))); + + // Place the current element in the predicted fragment + fragments[pred_bucket_idx][fragment_sizes[pred_bucket_idx]] = it[0]; + + // Update the fragment size and the bucket size + primary_bucket_sizes[pred_bucket_idx]++; + fragment_sizes[pred_bucket_idx]++; + + if (fragment_sizes[pred_bucket_idx] == PRIMARY_FRAGMENT_CAPACITY) { + fragments_written++; + // The predicted fragment is full, place in the array and update bucket + // size + std::move(fragments[pred_bucket_idx], + fragments[pred_bucket_idx] + PRIMARY_FRAGMENT_CAPACITY, + write_itr); + write_itr += PRIMARY_FRAGMENT_CAPACITY; + + // Reset the fragment size + fragment_sizes[pred_bucket_idx] = 0; + } + } + + //----------------------------------------------------------// + // DEFRAGMENTATION // + //----------------------------------------------------------// + + // Records the ending offset for the buckets + long bucket_end_offset[PRIMARY_FANOUT]{0}; + bucket_end_offset[0] = primary_bucket_sizes[0]; + + // Swap space + T *swap_buffer = new T[PRIMARY_FRAGMENT_CAPACITY]; + + // Maintains a writing iterator for each bucket, initialized at the starting + // offsets + long bucket_write_off[PRIMARY_FANOUT]{0}; + bucket_write_off[0] = 0; + + // Calculate the starting and ending offsets of each bucket + for (long bucket_idx = 1; bucket_idx < PRIMARY_FANOUT; ++bucket_idx) { + // Calculate the bucket end offsets (prefix sum) + bucket_end_offset[bucket_idx] = + primary_bucket_sizes[bucket_idx] + bucket_end_offset[bucket_idx - 1]; + + // Calculate the bucket start offsets and assign the writing iterator to + // that value + + // These offsets are aligned w.r.t. PRIMARY_FRAGMENT_CAPACITY + bucket_write_off[bucket_idx] = ceil(bucket_end_offset[bucket_idx - 1] * + 1. / PRIMARY_FRAGMENT_CAPACITY) * + PRIMARY_FRAGMENT_CAPACITY; + + // Fence the bucket iterator. 
This might occur because the write offset is + // not necessarily aligned with PRIMARY_FRAGMENT_CAPACITY + if (bucket_write_off[bucket_idx] > bucket_end_offset[bucket_idx]) { + bucket_write_off[bucket_idx] = bucket_end_offset[bucket_idx]; + } + } + + // This keeps track of which bucket we are operating on + long stored_bucket_idx = 0; + + // Go over each fragment and re-arrange them so that they are placed + // contiguously within each bucket boundaries + for (long fragment_idx = 0; fragment_idx < fragments_written; + ++fragment_idx) { + auto cur_fragment_start_off = fragment_idx * PRIMARY_FRAGMENT_CAPACITY; + + // Find the bucket where this fragment is stored into, which is not + // necessarily the bucket it belongs to + while (cur_fragment_start_off >= bucket_end_offset[stored_bucket_idx]) { + ++stored_bucket_idx; + } + + // Skip this fragment if its starting index is lower than the writing + // offset for the current bucket. This means that the fragment has already + // been dealt with + if (cur_fragment_start_off < bucket_write_off[stored_bucket_idx]) { + continue; + } + + // Find out what bucket the current fragment belongs to by looking at RMI + // prediction for the first element of the fragment. + auto first_elm_in_fragment = begin[cur_fragment_start_off]; + long pred_bucket_for_cur_fragment = static_cast(std::max( + 0., std::min(num_leaf_models - 1., + root_slope * first_elm_in_fragment + root_intercept))); + + // Predict the CDF + double pred_cdf = + slopes[pred_bucket_for_cur_fragment] * first_elm_in_fragment + + intercepts[pred_bucket_for_cur_fragment]; + + // Get the predicted bucket id + pred_bucket_for_cur_fragment = static_cast(std::max( + 0., std::min(PRIMARY_FANOUT - 1., pred_cdf * PRIMARY_FANOUT))); + + // If the current bucket contains fragments that are not all the way full, + // no need to use a swap buffer, since there is available space. The first + // condition checks whether the current fragment will need to be written + // in a bucket that was already consumed. The second condition checks + // whether the writing offset of the predicted bucket is beyond the last + // element written into the array. This means that there is empty space to + // write the fragments to. 
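+      // For instance (illustrative numbers only): with a fragment capacity of
+      // 100, if the partitioning pass flushed only two full fragments
+      // (elements 0..199) back into the array and the predicted bucket's
+      // write offset is 300, that offset lies beyond everything that was
+      // flushed, so the fragment below can simply be copied into place
+      // without evicting an incumbent fragment.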
+ if (bucket_write_off[pred_bucket_for_cur_fragment] < + cur_fragment_start_off || + bucket_write_off[pred_bucket_for_cur_fragment] >= + fragments_written * PRIMARY_FRAGMENT_CAPACITY) { + // If the current fragment will not be the last one to write in the + // predicted bucket + if (bucket_write_off[pred_bucket_for_cur_fragment] + + PRIMARY_FRAGMENT_CAPACITY <= + bucket_end_offset[pred_bucket_for_cur_fragment]) { + auto write_itr = + begin + bucket_write_off[pred_bucket_for_cur_fragment]; + auto read_itr = begin + cur_fragment_start_off; + + // Move the elements of the fragment to the bucket's write offset + std::copy(read_itr, read_itr + PRIMARY_FRAGMENT_CAPACITY, write_itr); + + // Update the bucket write offset + bucket_write_off[pred_bucket_for_cur_fragment] += + PRIMARY_FRAGMENT_CAPACITY; + + } else { // This is the last fragment to write into the predicted + // bucket + auto write_itr = + begin + bucket_write_off[pred_bucket_for_cur_fragment]; + auto read_itr = begin + cur_fragment_start_off; + + // Calculate the fragment size + auto cur_fragment_sz = + bucket_end_offset[pred_bucket_for_cur_fragment] - + bucket_write_off[pred_bucket_for_cur_fragment]; + + // Move the elements of the fragment to the bucket's write offset + std::copy(read_itr, read_itr + cur_fragment_sz, write_itr); + + // Update the bucket write offset for the predicted bucket + bucket_write_off[pred_bucket_for_cur_fragment] = + bucket_end_offset[pred_bucket_for_cur_fragment]; + + // Place the remaining elements of this fragment into the auxiliary + // fragment memory. This is needed when the start of the bucket might + // have empty spaces because the writing iterator was not aligned with + // FRAGMENT_CAPACITY (not a multiple) + for (int elm_idx = cur_fragment_sz; + elm_idx < PRIMARY_FRAGMENT_CAPACITY; elm_idx++) { + fragments[pred_bucket_for_cur_fragment] + [fragment_sizes[pred_bucket_for_cur_fragment] + elm_idx - + cur_fragment_sz] = read_itr[elm_idx]; + } + + // Update the auxiliary fragment size + fragment_sizes[pred_bucket_for_cur_fragment] += + PRIMARY_FRAGMENT_CAPACITY - cur_fragment_sz; + } + + } else { // The current fragment is to be written in a non-empty space, + // so an incumbent fragment will need to be evicted + + // If the fragment is already within the correct bucket + if (pred_bucket_for_cur_fragment == stored_bucket_idx) { + // If the empty area left in the bucket is not enough for all the + // elements in the fragment. 
This is needed when the start of the + // bucket might have empty spaces because the writing iterator was not + // aligned with FRAGMENT_CAPACITY (not a multiple) + if (bucket_end_offset[stored_bucket_idx] - + bucket_write_off[stored_bucket_idx] < + PRIMARY_FRAGMENT_CAPACITY) { + auto write_itr = begin + bucket_write_off[stored_bucket_idx]; + auto read_itr = begin + cur_fragment_start_off; + + // Calculate the amount of space left for the current bucket + long remaining_space = bucket_end_offset[stored_bucket_idx] - + bucket_write_off[stored_bucket_idx]; + + // Write out the fragment in the remaining space + std::copy(read_itr, read_itr + remaining_space, write_itr); + + // Update the read iterator to point to the remaining elements + read_itr = begin + cur_fragment_start_off + remaining_space; + + // Calculate the fragment size + auto cur_fragment_sz = fragment_sizes[stored_bucket_idx]; + + // Write the remaining elements into the auxiliary fragment space + for (int k = 0; k < PRIMARY_FRAGMENT_CAPACITY - remaining_space; + ++k) { + fragments[stored_bucket_idx][cur_fragment_sz++] = *(read_itr++); + } + + // Update the fragment size after the new placements + fragment_sizes[stored_bucket_idx] = cur_fragment_sz; + + // Update the bucket write offset to indicate that the bucket was + // fully written + bucket_write_off[stored_bucket_idx] = + bucket_end_offset[stored_bucket_idx]; + + } else { // The bucket has enough space for the incoming fragment + + auto write_itr = begin + bucket_write_off[stored_bucket_idx]; + auto read_itr = begin + cur_fragment_start_off; + + if (write_itr != read_itr) { + // Write the elements to the bucket's write offset + std::copy(read_itr, read_itr + PRIMARY_FRAGMENT_CAPACITY, + write_itr); + } + + // Update the write offset for the current bucket + bucket_write_off[stored_bucket_idx] += PRIMARY_FRAGMENT_CAPACITY; + } + } else { // The fragment is not in the correct bucket and needs to be + // swapped out with an incorrectly placed fragment there + + // Predict the bucket of the fragment that will be swapped out + auto first_elm_in_fragment_to_be_swapped_out = + begin[bucket_write_off[pred_bucket_for_cur_fragment]]; + + long pred_bucket_for_fragment_to_be_swapped_out = + static_cast(std::max( + 0., std::min( + num_leaf_models - 1., + root_slope * first_elm_in_fragment_to_be_swapped_out + + root_intercept))); + + // Predict the CDF + double pred_cdf = + slopes[pred_bucket_for_fragment_to_be_swapped_out] * + first_elm_in_fragment_to_be_swapped_out + + intercepts[pred_bucket_for_fragment_to_be_swapped_out]; + + // Get the predicted bucket idx + pred_bucket_for_fragment_to_be_swapped_out = static_cast( + std::max(0., std::min(PRIMARY_FANOUT - 1., + pred_cdf * PRIMARY_FANOUT))); + + // If the fragment at the next write offset is not already in the + // right bucket, swap the fragments + if (pred_bucket_for_fragment_to_be_swapped_out != + pred_bucket_for_cur_fragment) { + // Move the contents at the write offset into a swap buffer + auto itr_buf1 = + begin + bucket_write_off[pred_bucket_for_cur_fragment]; + auto itr_buf2 = begin + cur_fragment_start_off; + std::copy(itr_buf1, itr_buf1 + PRIMARY_FRAGMENT_CAPACITY, + swap_buffer); + + // Write the contents of the incoming fragment + std::copy(itr_buf2, itr_buf2 + PRIMARY_FRAGMENT_CAPACITY, itr_buf1); + + // Place the swap buffer into the emptied space + std::copy(swap_buffer, swap_buffer + PRIMARY_FRAGMENT_CAPACITY, + itr_buf2); + + pred_bucket_for_cur_fragment = + pred_bucket_for_fragment_to_be_swapped_out; + } else { // 
The fragment at the write offset is already in the right + // bucket + bucket_write_off[pred_bucket_for_cur_fragment] += + PRIMARY_FRAGMENT_CAPACITY; + } + + // Decrement the fragment index so that the newly swapped in fragment + // is not skipped over + --fragment_idx; + } + } + } + + // Add the elements remaining in the auxiliary fragments to the buckets they + // belong to. This is for when the fragments weren't full and thus not + // flushed to the input array + for (long bucket_idx = 0; bucket_idx < PRIMARY_FANOUT; ++bucket_idx) { + // Set the writing offset to the beggining of the bucket + long write_off = bucket_end_offset[bucket_idx - 1]; + if (bucket_idx == 0) { + write_off = 0; + } + + // Add the elements left in the auxiliary fragments to the beggining of + // the predicted bucket, since it was not full + long elm_idx; + for (elm_idx = 0; (elm_idx < fragment_sizes[bucket_idx]) && + (write_off % PRIMARY_FRAGMENT_CAPACITY != 0); + ++elm_idx) { + begin[write_off++] = fragments[bucket_idx][elm_idx]; + } + + // Add the remaining elements from the auxiliary fragments to the end of + // the bucket it belongs to + write_off = bucket_end_offset[bucket_idx] - fragment_sizes[bucket_idx]; + for (; elm_idx < fragment_sizes[bucket_idx]; ++elm_idx) { + begin[write_off + elm_idx] = fragments[bucket_idx][elm_idx]; + ++bucket_write_off[bucket_idx]; + } + } + + // Cleanup + delete[] swap_buffer; + delete[] fragments; + } + // clock_gettime(CLOCK_REALTIME, &ts2); + // printf("PARTITION THE KEYS INTO BUCKETS %ld us\n", + // ToNanoseconds(SubtractTime(ts2, ts1)) / 1000); + + //----------------------------------------------------------// + // SECOND ROUND OF PARTITIONING // + //----------------------------------------------------------// + + // clock_gettime(CLOCK_REALTIME, &ts1); + { + // Iterate over each bucket starting from the end so that the merging step + // later is done in-place + auto primary_bucket_start = begin; + auto primary_bucket_end = begin; + for (long primary_bucket_idx = 0; primary_bucket_idx < PRIMARY_FANOUT; + ++primary_bucket_idx) { + auto primary_bucket_sz = primary_bucket_sizes[primary_bucket_idx]; + + // Skip bucket if empty + if (primary_bucket_sz == 0) continue; + + // Update the start and the end of the bucket data + primary_bucket_start = primary_bucket_end; + primary_bucket_end += primary_bucket_sz; + + // Check for homogeneity + bool is_homogeneous = true; + if (rmi.enable_dups_detection) { + for (long elm_idx = 1; elm_idx < primary_bucket_sz; ++elm_idx) { + if (primary_bucket_start[elm_idx] != + primary_bucket_start[elm_idx - 1]) { + is_homogeneous = false; + break; + } + } + } + + // When the bucket is homogeneous, skip sorting it + if (rmi.enable_dups_detection && is_homogeneous) { + num_elms_finalized += primary_bucket_sz; + + } + + // When the bucket is not homogeneous, and it's not flagged for duplicates + else { + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -// + // PARTITION THE KEYS INTO SECONDARY BUCKETS // + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -// + + // Keeps track of the number of elements in each secondary bucket + long secondary_bucket_sizes[SECONDARY_FANOUT]{0}; + + // Keeps track of the number of elements in each fragment + long fragment_sizes[SECONDARY_FANOUT]{0}; + + // An auxiliary set of fragments where the elements will be partitioned + auto fragments = new T[SECONDARY_FANOUT][SECONDARY_FRAGMENT_CAPACITY]; + + // Keeps track of the number of fragments that have been written back to + // the original array + long 
fragments_written = 0; + + // Points to the next free space where to write back + auto write_itr = primary_bucket_start; + + // For each element in the input, predict which bucket it would go to, + // and insert to the respective bucket fragment + for (auto it = primary_bucket_start; it != primary_bucket_end; ++it) { + // Predict the model id in the leaf layer of the RMI + long pred_bucket_idx = static_cast( + std::max(0., std::min(num_leaf_models - 1., + root_slope * it[0] + root_intercept))); + + // Predict the CDF + double pred_cdf = + slopes[pred_bucket_idx] * it[0] + intercepts[pred_bucket_idx]; + + // Get the predicted bucket id + pred_bucket_idx = static_cast(std::max( + 0., std::min(SECONDARY_FANOUT - 1., + (pred_cdf * PRIMARY_FANOUT - primary_bucket_idx) * + SECONDARY_FANOUT))); + + // Place the current element in the predicted fragment + fragments[pred_bucket_idx][fragment_sizes[pred_bucket_idx]] = it[0]; + + // Update the fragment size and the bucket size + ++secondary_bucket_sizes[pred_bucket_idx]; + ++fragment_sizes[pred_bucket_idx]; + + if (fragment_sizes[pred_bucket_idx] == SECONDARY_FRAGMENT_CAPACITY) { + ++fragments_written; + // The predicted fragment is full, place in the array and update + // bucket size + std::move(fragments[pred_bucket_idx], + fragments[pred_bucket_idx] + SECONDARY_FRAGMENT_CAPACITY, + write_itr); + write_itr += SECONDARY_FRAGMENT_CAPACITY; + + // Reset the fragment size + fragment_sizes[pred_bucket_idx] = 0; + } + } // end of partitioninig over secondary fragments + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -// + // DEFRAGMENTATION // + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -// + + // Records the ending offset for the buckets + long bucket_end_offset[SECONDARY_FANOUT]{0}; + bucket_end_offset[0] = secondary_bucket_sizes[0]; + + // Swap space + T *swap_buffer = new T[SECONDARY_FRAGMENT_CAPACITY]; + + // Maintains a writing iterator for each bucket, initialized at the + // starting offsets + long bucket_start_off[SECONDARY_FANOUT]{0}; + bucket_start_off[0] = 0; + + // Calculate the starting and ending offsets of each bucket + for (long bucket_idx = 1; bucket_idx < SECONDARY_FANOUT; ++bucket_idx) { + // Calculate the bucket end offsets (prefix sum) + bucket_end_offset[bucket_idx] = secondary_bucket_sizes[bucket_idx] + + bucket_end_offset[bucket_idx - 1]; + + // Calculate the bucket start offsets and assign the writing + // iterator to that value + + // These offsets are aligned w.r.t. SECONDARY_FRAGMENT_CAPACITY + bucket_start_off[bucket_idx] = + ceil(bucket_end_offset[bucket_idx - 1] * 1. / + SECONDARY_FRAGMENT_CAPACITY) * + SECONDARY_FRAGMENT_CAPACITY; + + // Fence the bucket iterator. 
This might occur because the write + // offset is not necessarily aligned with SECONDARY_FRAGMENT_CAPACITY + if (bucket_start_off[bucket_idx] > bucket_end_offset[bucket_idx]) { + bucket_start_off[bucket_idx] = bucket_end_offset[bucket_idx]; + } + } + + // This keeps track of which bucket we are operating on + long stored_bucket_idx = 0; + + // Go over each fragment and re-arrange them so that they are placed + // contiguously within each bucket boundaries + for (long fragment_idx = 0; fragment_idx < fragments_written; + ++fragment_idx) { + auto cur_fragment_start_off = + fragment_idx * SECONDARY_FRAGMENT_CAPACITY; + + // Find the bucket where this fragment is stored into, which is not + // necessarily the bucket it belongs to + while (cur_fragment_start_off >= + bucket_end_offset[stored_bucket_idx]) { + ++stored_bucket_idx; + } + + // Skip this fragment if its starting index is lower than the writing + // offset for the current bucket. This means that the fragment has + // already been dealt with + if (cur_fragment_start_off < bucket_start_off[stored_bucket_idx]) { + continue; + } + + // Find out what bucket the current fragment belongs to by looking at + // RMI prediction for the first element of the fragment. + auto first_elm_in_fragment = + primary_bucket_start[cur_fragment_start_off]; + long pred_bucket_for_cur_fragment = static_cast(std::max( + 0., + std::min(num_leaf_models - 1., + root_slope * first_elm_in_fragment + root_intercept))); + + // Predict the CDF + double pred_cdf = + slopes[pred_bucket_for_cur_fragment] * first_elm_in_fragment + + intercepts[pred_bucket_for_cur_fragment]; + + // Get the predicted bucket id + pred_bucket_for_cur_fragment = static_cast(std::max( + 0., std::min(SECONDARY_FANOUT - 1., + (pred_cdf * PRIMARY_FANOUT - primary_bucket_idx) * + SECONDARY_FANOUT))); + + // If the current bucket contains fragments that are not all the way + // full, no need to use a swap fragment, since there is available + // space. The first condition checks whether the current fragment will + // need to be written in a bucket that was already consumed. The + // second condition checks whether the writing offset of the + // predicted bucket is beyond the last element written into the + // array. This means that there is empty space to write the fragments + // to. 
+ if (bucket_start_off[pred_bucket_for_cur_fragment] < + cur_fragment_start_off || + bucket_start_off[pred_bucket_for_cur_fragment] >= + fragments_written * SECONDARY_FRAGMENT_CAPACITY) { + // If the current fragment will not be the last one to write in the + // predicted bucket + if (bucket_start_off[pred_bucket_for_cur_fragment] + + SECONDARY_FRAGMENT_CAPACITY <= + bucket_end_offset[pred_bucket_for_cur_fragment]) { + auto write_itr = primary_bucket_start + + bucket_start_off[pred_bucket_for_cur_fragment]; + auto read_itr = primary_bucket_start + cur_fragment_start_off; + + // Move the elements of the fragment to the bucket's write offset + std::copy(read_itr, read_itr + SECONDARY_FRAGMENT_CAPACITY, + write_itr); + + // Update the bucket write offset + bucket_start_off[pred_bucket_for_cur_fragment] += + SECONDARY_FRAGMENT_CAPACITY; + + } else { // This is the last fragment to write into the predicted + // bucket + auto write_itr = primary_bucket_start + + bucket_start_off[pred_bucket_for_cur_fragment]; + auto read_itr = primary_bucket_start + cur_fragment_start_off; + + // Calculate the fragment size + auto cur_fragment_sz = + bucket_end_offset[pred_bucket_for_cur_fragment] - + bucket_start_off[pred_bucket_for_cur_fragment]; + + // Move the elements of the fragment to the bucket's write offset + std::copy(read_itr, read_itr + cur_fragment_sz, write_itr); + + // Update the bucket write offset for the predicted bucket + bucket_start_off[pred_bucket_for_cur_fragment] = + bucket_end_offset[pred_bucket_for_cur_fragment]; + + // Place the remaining elements of this fragment into the + // auxiliary fragment memory. This is needed when the start of the + // bucket might have empty spaces because the writing iterator + // was not aligned with FRAGMENT_CAPACITY (not a multiple) + for (long elm_idx = cur_fragment_sz; + elm_idx < SECONDARY_FRAGMENT_CAPACITY; elm_idx++) { + fragments[pred_bucket_for_cur_fragment] + [fragment_sizes[pred_bucket_for_cur_fragment] + + elm_idx - cur_fragment_sz] = read_itr[elm_idx]; + } + + // Update the auxiliary fragment size + fragment_sizes[pred_bucket_for_cur_fragment] += + SECONDARY_FRAGMENT_CAPACITY - cur_fragment_sz; + } + + } else { // The current fragment is to be written in a non-empty + // space, so an incumbent fragment will need to be evicted + + // If the fragment is already within the correct bucket + if (pred_bucket_for_cur_fragment == stored_bucket_idx) { + // If the empty area left in the bucket is not enough for all + // the elements in the fragment. 
This is needed when the start of + // the bucket might have empty spaces because the writing + // iterator was not aligned with FRAGMENT_CAPACITY (not a + // multiple) + if (bucket_end_offset[stored_bucket_idx] - + bucket_start_off[stored_bucket_idx] < + SECONDARY_FRAGMENT_CAPACITY) { + auto write_itr = + primary_bucket_start + bucket_start_off[stored_bucket_idx]; + auto read_itr = primary_bucket_start + cur_fragment_start_off; + + // Calculate the amount of space left for the current bucket + long remaining_space = bucket_end_offset[stored_bucket_idx] - + bucket_start_off[stored_bucket_idx]; + + // Write out the fragment in the remaining space + std::copy(read_itr, read_itr + remaining_space, write_itr); + + // Update the read iterator to point to the remaining elements + read_itr = primary_bucket_start + cur_fragment_start_off + + remaining_space; + + // Calculate the fragment size + auto cur_fragment_sz = fragment_sizes[stored_bucket_idx]; + + // Write the remaining elements into the auxiliary fragment + // space + for (int k = 0; + k < SECONDARY_FRAGMENT_CAPACITY - remaining_space; ++k) { + fragments[stored_bucket_idx][cur_fragment_sz++] = + *(read_itr++); + } + + // Update the fragment size after the new placements + fragment_sizes[stored_bucket_idx] = cur_fragment_sz; + + // Update the bucket write offset to indicate that the bucket + // was fully written + bucket_start_off[stored_bucket_idx] = + bucket_end_offset[stored_bucket_idx]; + + } else { // The bucket has enough space for the incoming fragment + + auto write_itr = + primary_bucket_start + bucket_start_off[stored_bucket_idx]; + auto read_itr = primary_bucket_start + cur_fragment_start_off; + + if (write_itr != read_itr) { + // Write the elements to the bucket's write offset + std::copy(read_itr, read_itr + SECONDARY_FRAGMENT_CAPACITY, + write_itr); + } + + // Update the write offset for the current bucket + bucket_start_off[stored_bucket_idx] += + SECONDARY_FRAGMENT_CAPACITY; + } + } else { // The fragment is not in the correct bucket and needs to + // be + // swapped out with an incorrectly placed fragment there + + // Predict the bucket of the fragment that will be swapped out + auto first_elm_in_fragment_to_be_swapped_out = + primary_bucket_start + [bucket_start_off[pred_bucket_for_cur_fragment]]; + + long pred_bucket_for_fragment_to_be_swapped_out = + static_cast(std::max( + 0., + std::min( + num_leaf_models - 1., + root_slope * first_elm_in_fragment_to_be_swapped_out + + root_intercept))); + + // Predict the CDF + double pred_cdf = + slopes[pred_bucket_for_fragment_to_be_swapped_out] * + first_elm_in_fragment_to_be_swapped_out + + intercepts[pred_bucket_for_fragment_to_be_swapped_out]; + + // Get the predicted bucket idx + pred_bucket_for_fragment_to_be_swapped_out = static_cast( + std::max(0., std::min(SECONDARY_FANOUT - 1., + (pred_cdf * PRIMARY_FANOUT - + primary_bucket_idx) * + SECONDARY_FANOUT))); + + // If the fragment at the next write offset is not already in the + // right bucket, swap the fragments + if (pred_bucket_for_fragment_to_be_swapped_out != + pred_bucket_for_cur_fragment) { + // Move the contents at the write offset into a swap fragment + auto itr_buf1 = primary_bucket_start + + bucket_start_off[pred_bucket_for_cur_fragment]; + auto itr_buf2 = primary_bucket_start + cur_fragment_start_off; + std::copy(itr_buf1, itr_buf1 + SECONDARY_FRAGMENT_CAPACITY, + swap_buffer); + + // Write the contents of the incoming fragment + std::copy(itr_buf2, itr_buf2 + SECONDARY_FRAGMENT_CAPACITY, + itr_buf1); + + // Place 
the swap buffer into the emptied space + std::copy(swap_buffer, + swap_buffer + SECONDARY_FRAGMENT_CAPACITY, itr_buf2); + + pred_bucket_for_cur_fragment = + pred_bucket_for_fragment_to_be_swapped_out; + } else { // The fragment at the write offset is already in the + // right bucket + bucket_start_off[pred_bucket_for_cur_fragment] += + SECONDARY_FRAGMENT_CAPACITY; + } + + // Decrement the fragment index so that the newly swapped in + // fragment is not skipped over + --fragment_idx; + } + } + } + + // Add the elements remaining in the auxiliary fragments to the buckets + // they belong to. This is for when the fragments weren't full and thus + // not flushed to the input array + for (long bucket_idx = 0; bucket_idx < SECONDARY_FANOUT; ++bucket_idx) { + // Set the writing offset to the beggining of the bucket + long write_off = bucket_end_offset[bucket_idx - 1]; + if (bucket_idx == 0) { + write_off = 0; + } + + // Add the elements left in the auxiliary fragments to the beggining + // of the predicted bucket, since it was not full + long elm_idx; + for (elm_idx = 0; (elm_idx < fragment_sizes[bucket_idx]) && + (write_off % SECONDARY_FRAGMENT_CAPACITY != 0); + ++elm_idx) { + primary_bucket_start[write_off++] = fragments[bucket_idx][elm_idx]; + } + + // Add the remaining elements from the auxiliary fragments to the end + // of the bucket it belongs to + write_off = + bucket_end_offset[bucket_idx] - fragment_sizes[bucket_idx]; + for (; elm_idx < fragment_sizes[bucket_idx]; ++elm_idx) { + primary_bucket_start[write_off + elm_idx] = + fragments[bucket_idx][elm_idx]; + ++bucket_start_off[bucket_idx]; + } + } + + // Cleanup + delete[] swap_buffer; + delete[] fragments; + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -// + // MODEL-BASED COUNTING SORT // + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -// + // Iterate over the secondary buckets + for (long secondary_bucket_idx = 0; + secondary_bucket_idx < SECONDARY_FANOUT; ++secondary_bucket_idx) { + auto secondary_bucket_sz = + secondary_bucket_sizes[secondary_bucket_idx]; + auto secondary_bucket_start_off = num_elms_finalized; + auto secondary_bucket_end_off = + secondary_bucket_start_off + secondary_bucket_sz; + + // Skip bucket if empty + if (secondary_bucket_sz == 0) continue; + + // Check for homogeneity + bool is_homogeneous = true; + if (rmi.enable_dups_detection) { + for (long elm_idx = secondary_bucket_start_off + 1; + elm_idx < secondary_bucket_end_off; ++elm_idx) { + if (begin[elm_idx] != begin[elm_idx - 1]) { + is_homogeneous = false; + break; + } + } + } + + if (!(rmi.enable_dups_detection and is_homogeneous)) { + long adjustment_offset = + 1. * + (primary_bucket_idx * SECONDARY_FANOUT + secondary_bucket_idx) * + input_sz / (PRIMARY_FANOUT * SECONDARY_FANOUT); + + // Saves the predicted CDFs for the Counting Sort subroutine + vector pred_cache_cs(secondary_bucket_sz); + + // Count array for the model-enhanced counting sort subroutine + vector cnt_hist(secondary_bucket_sz, 0); + + /* + * OPTIMIZATION + * We check to see if the first and last element in the current + * bucket used the same leaf model to obtain their CDF. If that is + * the case, then we don't need to traverse the CDF model for + * every element in this bucket, hence decreasing the inference + * complexity from O(num_layer) to O(1). 
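+           *
+           * If an element between the first and the last one would actually
+           * map to a different leaf model, its predicted slot may be slightly
+           * off; as with any model misprediction here, that is corrected by
+           * the final insertion-sort touch-up pass at the end of the sort.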
+ */ + + long pred_model_first_elm = static_cast(std::max( + 0., std::min(num_leaf_models - 1., + root_slope * begin[secondary_bucket_start_off] + + root_intercept))); + + long pred_model_last_elm = static_cast(std::max( + 0., std::min(num_leaf_models - 1., + root_slope * begin[secondary_bucket_end_off - 1] + + root_intercept))); + + if (pred_model_first_elm == pred_model_last_elm) { + // Avoid CDF model traversal and predict the CDF only using the + // leaf model + + // Iterate over the elements and place them into the secondary + // buckets + for (long elm_idx = 0; elm_idx < secondary_bucket_sz; ++elm_idx) { + // Find the current element + auto cur_key = begin[secondary_bucket_start_off + elm_idx]; + + // Predict the CDF + double pred_cdf = slopes[pred_model_first_elm] * cur_key + + intercepts[pred_model_first_elm]; + + // Scale the predicted CDF to the input size and save it + pred_cache_cs[elm_idx] = static_cast(std::max( + 0., std::min(secondary_bucket_sz - 1., + (pred_cdf * input_sz) - adjustment_offset))); + + // Update the counts + ++cnt_hist[pred_cache_cs[elm_idx]]; + } + } else { + // Fully traverse the CDF model again to predict the CDF of the + // current element + + // Iterate over the elements and place them into the minor + // buckets + for (long elm_idx = 0; elm_idx < secondary_bucket_sz; ++elm_idx) { + // Find the current element + auto cur_key = begin[secondary_bucket_start_off + elm_idx]; + + // Predict the model idx in the leaf layer + auto model_idx_next_layer = static_cast(std::max( + 0., std::min(num_leaf_models - 1., + root_slope * cur_key + root_intercept))); + // Predict the CDF + double pred_cdf = slopes[model_idx_next_layer] * cur_key + + intercepts[model_idx_next_layer]; + + // Scale the predicted CDF to the input size and save it + pred_cache_cs[elm_idx] = static_cast(std::max( + 0., std::min(secondary_bucket_sz - 1., + (pred_cdf * input_sz) - adjustment_offset))); + + // Update the counts + ++cnt_hist[pred_cache_cs[elm_idx]]; + } + } + + --cnt_hist[0]; + + // Calculate the running totals + for (long i = 1; i < secondary_bucket_sz; ++i) { + cnt_hist[i] += cnt_hist[i - 1]; + } + + // Allocate a temporary buffer for placing the keys in sorted + // order + vector tmp(secondary_bucket_sz); + + // Re-shuffle the elms based on the calculated cumulative counts + for (long elm_idx = 0; elm_idx < secondary_bucket_sz; ++elm_idx) { + // Place the element in the predicted position in the array + + tmp[cnt_hist[pred_cache_cs[elm_idx]]] = + begin[secondary_bucket_start_off + elm_idx]; + + // Update counts + --cnt_hist[pred_cache_cs[elm_idx]]; + } + + // Write back the temprorary buffer to the original input + std::copy(tmp.begin(), tmp.end(), + begin + secondary_bucket_start_off); + } + // Update the number of finalized elements + num_elms_finalized += secondary_bucket_sz; + } // end of iteration over the secondary buckets + + } // end of processing for non-flagged, non-homogeneous primary buckets + } // end of iteration over primary buckets + } + // clock_gettime(CLOCK_REALTIME, &ts2); + // printf("SECOND ROUND OF PARTITIONING %ld us\n", + // ToNanoseconds(SubtractTime(ts2, ts1)) / 1000); + + // Touch up + // clock_gettime(CLOCK_REALTIME, &ts1); + learned_sort::utils::insertion_sort(begin, end); + // clock_gettime(CLOCK_REALTIME, &ts2); + // printf("INSERTION SORT %ld us\n", + // ToNanoseconds(SubtractTime(ts2, ts1)) / 1000); +} + +/** + * @brief Sorts a sequence of numerical keys from [begin, end) using Learned + * Sort, in ascending order. 
+ * + * @tparam RandomIt A bi-directional random iterator over the sequence of keys + * @param begin Random-access iterators to the initial position of the + * sequence to be used for sorting. The range used is [begin,end), which + * contains all the elements between first and last, including the element + * pointed by first but not the element pointed by last. + * @param end Random-access iterators to the last position of the sequence to + * be used for sorting. The range used is [begin,end), which contains all the + * elements between first and last, including the element pointed by first but + * not the element pointed by last. + * @param params The hyperparameters for the CDF model, which describe the + * architecture and sampling ratio. + */ +template +void sort( + RandomIt begin, RandomIt end, + typename TwoLayerRMI::value_type>::Params + ¶ms) { + // Check if the data is already sorted + // struct timespec ts1, ts2; + // clock_gettime(CLOCK_REALTIME, &ts1); + if (*(end - 1) >= *begin && std::is_sorted(begin, end)) { + return; + } + // clock_gettime(CLOCK_REALTIME, &ts2); + // printf("IS SORTED %ld us\n", ToNanoseconds(SubtractTime(ts2, ts1)) / 1000); + + // Check if the data is sorted in descending order + // clock_gettime(CLOCK_REALTIME, &ts1); + if (*(end - 1) <= *begin) { + auto is_reverse_sorted = true; + + for (auto i = begin; i != end - 1; ++i) { + if (*i < *(i + 1)) { + is_reverse_sorted = false; + } + } + + if (is_reverse_sorted) { + std::reverse(begin, end); + return; + } + } + // clock_gettime(CLOCK_REALTIME, &ts2); + // printf("CHECK ORDER %ld us\n", ToNanoseconds(SubtractTime(ts2, ts1)) / 1000); + + if (std::distance(begin, end) <= + std::max(params.fanout * params.threshold, + 5 * params.num_leaf_models)) { + std::sort(begin, end); + } else { + // Initialize the RMI + TwoLayerRMI::value_type> rmi(params); + + // Check if the model can be trained + // clock_gettime(CLOCK_REALTIME, &ts1); + if (rmi.train(begin, end)) { + // clock_gettime(CLOCK_REALTIME, &ts2); + // printf("TRAIN %ld us\n", ToNanoseconds(SubtractTime(ts2, ts1)) / 1000); + // Sort the data if the model was successfully trained + learned_sort::sort(begin, end, rmi); + } + + else { // Fall back in case the model could not be trained + std::sort(begin, end); + } + } +} + +/** + * @brief Sorts a sequence of numerical keys from [begin, end) using Learned + * Sort, in ascending order. + * + * @tparam RandomIt A bi-directional random iterator over the sequence of keys + * @param begin Random-access iterators to the initial position of the + * sequence to be used for sorting. The range used is [begin,end), which + * contains all the elements between first and last, including the element + * pointed by first but not the element pointed by last. + * @param end Random-access iterators to the last position of the sequence to + * be used for sorting. The range used is [begin,end), which contains all the + * elements between first and last, including the element pointed by first but + * not the element pointed by last. 
+ */ +template +void sort(RandomIt begin, RandomIt end) { + if (begin != end) { + typename TwoLayerRMI::value_type>::Params + p; + learned_sort::sort(begin, end, p); + } +} + +} // namespace learned_sort + +#endif /* __cplusplus */ +COSMOPOLITAN_C_START_ + +void learned_sort_int64(int64_t *, size_t); + +COSMOPOLITAN_C_END_ +#endif /* COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_H_ */ diff --git a/third_party/libcxx/libcxx.mk b/third_party/libcxx/libcxx.mk index ff1ab6b55..c6103d788 100644 --- a/third_party/libcxx/libcxx.mk +++ b/third_party/libcxx/libcxx.mk @@ -8,6 +8,11 @@ THIRD_PARTY_LIBCXX = $(THIRD_PARTY_LIBCXX_A_DEPS) $(THIRD_PARTY_LIBCXX_A) THIRD_PARTY_LIBCXX_A = o/$(MODE)/third_party/libcxx/libcxx.a THIRD_PARTY_LIBCXX_A_HDRS = \ + third_party/libcxx/rmi.h \ + third_party/libcxx/utils.h \ + third_party/libcxx/learned_sort.h \ + third_party/libcxx/pdqsort.h \ + third_party/libcxx/ska_sort.h \ third_party/libcxx/__bit_reference \ third_party/libcxx/__bsd_locale_fallbacks.h \ third_party/libcxx/__config \ @@ -138,6 +143,9 @@ THIRD_PARTY_LIBCXX_A_HDRS = \ third_party/libcxx/wctype.h THIRD_PARTY_LIBCXX_A_SRCS_CC = \ + third_party/libcxx/learned_sort.cc \ + third_party/libcxx/pdqsort.cc \ + third_party/libcxx/ska_sort.cc \ third_party/libcxx/algorithm.cc \ third_party/libcxx/charconv.cc \ third_party/libcxx/chrono.cc \ diff --git a/third_party/libcxx/pdqsort.cc b/third_party/libcxx/pdqsort.cc new file mode 100644 index 000000000..c2bc41fc3 --- /dev/null +++ b/third_party/libcxx/pdqsort.cc @@ -0,0 +1,25 @@ +/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│ +│vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2023 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "third_party/libcxx/pdqsort.h" + +void pdqsort_int64(int64_t* A, size_t n) { pdqsort(A, A + n); } + +void pdqsort_branchless_int64(int64_t* A, size_t n) { + pdqsort_branchless(A, A + n); +} diff --git a/third_party/libcxx/pdqsort.h b/third_party/libcxx/pdqsort.h new file mode 100644 index 000000000..169e49dc2 --- /dev/null +++ b/third_party/libcxx/pdqsort.h @@ -0,0 +1,552 @@ +/* -*- c++ -*- */ +/* clang-format off */ +/* + pdqsort.h - Pattern-defeating quicksort. + + Copyright (c) 2015 Orson Peters + + This software is provided 'as-is', without any express or implied warranty. In no event will the + authors be held liable for any damages arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, including commercial + applications, and to alter it and redistribute it freely, subject to the following restrictions: + + 1. 
The origin of this software must not be misrepresented; you must not claim that you wrote the + original software. If you use this software in a product, an acknowledgment in the product + documentation would be appreciated but is not required. + + 2. Altered source versions must be plainly marked as such, and must not be misrepresented as + being the original software. + + 3. This notice may not be removed or altered from any source distribution. +*/ + + +#ifndef PDQSORT_H +#define PDQSORT_H +#ifdef __cplusplus +#include "third_party/libcxx/algorithm" +#include "third_party/libcxx/cstddef" +#include "third_party/libcxx/functional" +#include "third_party/libcxx/utility" +#include "third_party/libcxx/iterator" + +#if __cplusplus >= 201103L + #include "third_party/libcxx/cstdint" + #include "third_party/libcxx/type_traits" + #define PDQSORT_PREFER_MOVE(x) std::move(x) +#else + #define PDQSORT_PREFER_MOVE(x) (x) +#endif + + +namespace pdqsort_detail { + enum { + // Partitions below this size are sorted using insertion sort. + insertion_sort_threshold = 24, + + // Partitions above this size use Tukey's ninther to select the pivot. + ninther_threshold = 128, + + // When we detect an already sorted partition, attempt an insertion sort that allows this + // amount of element moves before giving up. + partial_insertion_sort_limit = 8, + + // Must be multiple of 8 due to loop unrolling, and < 256 to fit in unsigned char. + block_size = 64, + + // Cacheline size, assumes power of two. + cacheline_size = 64 + + }; + +#if __cplusplus >= 201103L + template struct is_default_compare : std::false_type { }; + template struct is_default_compare > : std::true_type { }; + template struct is_default_compare > : std::true_type { }; +#endif + + // Returns floor(log2(n)), assumes n > 0. + template + inline int log2(T n) { + int log = 0; + while (n >>= 1) ++log; + return log; + } + + // Sorts [begin, end) using insertion sort with the given comparison function. + template + inline void insertion_sort(Iter begin, Iter end, Compare comp) { + typedef typename std::iterator_traits::value_type T; + if (begin == end) return; + + for (Iter cur = begin + 1; cur != end; ++cur) { + Iter sift = cur; + Iter sift_1 = cur - 1; + + // Compare first so we can avoid 2 moves for an element already positioned correctly. + if (comp(*sift, *sift_1)) { + T tmp = PDQSORT_PREFER_MOVE(*sift); + + do { *sift-- = PDQSORT_PREFER_MOVE(*sift_1); } + while (sift != begin && comp(tmp, *--sift_1)); + + *sift = PDQSORT_PREFER_MOVE(tmp); + } + } + } + + // Sorts [begin, end) using insertion sort with the given comparison function. Assumes + // *(begin - 1) is an element smaller than or equal to any element in [begin, end). + template + inline void unguarded_insertion_sort(Iter begin, Iter end, Compare comp) { + typedef typename std::iterator_traits::value_type T; + if (begin == end) return; + + for (Iter cur = begin + 1; cur != end; ++cur) { + Iter sift = cur; + Iter sift_1 = cur - 1; + + // Compare first so we can avoid 2 moves for an element already positioned correctly. + if (comp(*sift, *sift_1)) { + T tmp = PDQSORT_PREFER_MOVE(*sift); + + do { *sift-- = PDQSORT_PREFER_MOVE(*sift_1); } + while (comp(tmp, *--sift_1)); + + *sift = PDQSORT_PREFER_MOVE(tmp); + } + } + } + + // Attempts to use insertion sort on [begin, end). Will return false if more than + // partial_insertion_sort_limit elements were moved, and abort sorting. Otherwise it will + // successfully sort and return true. 
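+    // pdqsort attempts this on partitions that came back already partitioned
+    // (the pivot swapped nothing), on the guess that they are probably sorted
+    // already; the move limit bounds the wasted work when that guess is wrong.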
+ template + inline bool partial_insertion_sort(Iter begin, Iter end, Compare comp) { + typedef typename std::iterator_traits::value_type T; + if (begin == end) return true; + + std::size_t limit = 0; + for (Iter cur = begin + 1; cur != end; ++cur) { + Iter sift = cur; + Iter sift_1 = cur - 1; + + // Compare first so we can avoid 2 moves for an element already positioned correctly. + if (comp(*sift, *sift_1)) { + T tmp = PDQSORT_PREFER_MOVE(*sift); + + do { *sift-- = PDQSORT_PREFER_MOVE(*sift_1); } + while (sift != begin && comp(tmp, *--sift_1)); + + *sift = PDQSORT_PREFER_MOVE(tmp); + limit += cur - sift; + } + + if (limit > partial_insertion_sort_limit) return false; + } + + return true; + } + + template + inline void sort2(Iter a, Iter b, Compare comp) { + if (comp(*b, *a)) std::iter_swap(a, b); + } + + // Sorts the elements *a, *b and *c using comparison function comp. + template + inline void sort3(Iter a, Iter b, Iter c, Compare comp) { + sort2(a, b, comp); + sort2(b, c, comp); + sort2(a, b, comp); + } + + template + inline T* align_cacheline(T* p) { +#if defined(UINTPTR_MAX) && __cplusplus >= 201103L + std::uintptr_t ip = reinterpret_cast(p); +#else + std::size_t ip = reinterpret_cast(p); +#endif + ip = (ip + cacheline_size - 1) & -cacheline_size; + return reinterpret_cast(ip); + } + + template + inline void swap_offsets(Iter first, Iter last, + unsigned char* offsets_l, unsigned char* offsets_r, + int num, bool use_swaps) { + typedef typename std::iterator_traits::value_type T; + if (use_swaps) { + // This case is needed for the descending distribution, where we need + // to have proper swapping for pdqsort to remain O(n). + for (int i = 0; i < num; ++i) { + std::iter_swap(first + offsets_l[i], last - offsets_r[i]); + } + } else if (num > 0) { + Iter l = first + offsets_l[0]; Iter r = last - offsets_r[0]; + T tmp(PDQSORT_PREFER_MOVE(*l)); *l = PDQSORT_PREFER_MOVE(*r); + for (int i = 1; i < num; ++i) { + l = first + offsets_l[i]; *r = PDQSORT_PREFER_MOVE(*l); + r = last - offsets_r[i]; *l = PDQSORT_PREFER_MOVE(*r); + } + *r = PDQSORT_PREFER_MOVE(tmp); + } + } + + // Partitions [begin, end) around pivot *begin using comparison function comp. Elements equal + // to the pivot are put in the right-hand partition. Returns the position of the pivot after + // partitioning and whether the passed sequence already was correctly partitioned. Assumes the + // pivot is a median of at least 3 elements and that [begin, end) is at least + // insertion_sort_threshold long. Uses branchless partitioning. + template + inline std::pair partition_right_branchless(Iter begin, Iter end, Compare comp) { + typedef typename std::iterator_traits::value_type T; + + // Move pivot into local for speed. + T pivot(PDQSORT_PREFER_MOVE(*begin)); + Iter first = begin; + Iter last = end; + + // Find the first element greater than or equal than the pivot (the median of 3 guarantees + // this exists). + while (comp(*++first, pivot)); + + // Find the first element strictly smaller than the pivot. We have to guard this search if + // there was no element before *first. + if (first - 1 == begin) while (first < last && !comp(*--last, pivot)); + else while ( !comp(*--last, pivot)); + + // If the first pair of elements that should be swapped to partition are the same element, + // the passed in sequence already was correctly partitioned. 
+ bool already_partitioned = first >= last; + if (!already_partitioned) { + std::iter_swap(first, last); + ++first; + } + + // The following branchless partitioning is derived from "BlockQuicksort: How Branch + // Mispredictions don’t affect Quicksort" by Stefan Edelkamp and Armin Weiss. + unsigned char offsets_l_storage[block_size + cacheline_size]; + unsigned char offsets_r_storage[block_size + cacheline_size]; + unsigned char* offsets_l = align_cacheline(offsets_l_storage); + unsigned char* offsets_r = align_cacheline(offsets_r_storage); + int num_l, num_r, start_l, start_r; + num_l = num_r = start_l = start_r = 0; + + while (last - first > 2 * block_size) { + // Fill up offset blocks with elements that are on the wrong side. + if (num_l == 0) { + start_l = 0; + Iter it = first; + for (unsigned char i = 0; i < block_size;) { + offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it; + offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it; + offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it; + offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it; + offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it; + offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it; + offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it; + offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it; + } + } + if (num_r == 0) { + start_r = 0; + Iter it = last; + for (unsigned char i = 0; i < block_size;) { + offsets_r[num_r] = ++i; num_r += comp(*--it, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--it, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--it, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--it, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--it, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--it, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--it, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--it, pivot); + } + } + + // Swap elements and update block sizes and first/last boundaries. + int num = std::min(num_l, num_r); + swap_offsets(first, last, offsets_l + start_l, offsets_r + start_r, + num, num_l == num_r); + num_l -= num; num_r -= num; + start_l += num; start_r += num; + if (num_l == 0) first += block_size; + if (num_r == 0) last -= block_size; + } + + int l_size = 0, r_size = 0; + int unknown_left = (int)(last - first) - ((num_r || num_l) ? block_size : 0); + if (num_r) { + // Handle leftover block by assigning the unknown elements to the other block. + l_size = unknown_left; + r_size = block_size; + } else if (num_l) { + l_size = block_size; + r_size = unknown_left; + } else { + // No leftover block, split the unknown elements in two blocks. + l_size = unknown_left/2; + r_size = unknown_left - l_size; + } + + // Fill offset buffers if needed. + if (unknown_left && !num_l) { + start_l = 0; + Iter it = first; + for (unsigned char i = 0; i < l_size;) { + offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it; + } + } + if (unknown_left && !num_r) { + start_r = 0; + Iter it = last; + for (unsigned char i = 0; i < r_size;) { + offsets_r[num_r] = ++i; num_r += comp(*--it, pivot); + } + } + + int num = std::min(num_l, num_r); + swap_offsets(first, last, offsets_l + start_l, offsets_r + start_r, num, num_l == num_r); + num_l -= num; num_r -= num; + start_l += num; start_r += num; + if (num_l == 0) first += l_size; + if (num_r == 0) last -= r_size; + + // We have now fully identified [first, last)'s proper position. Swap the last elements. 
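+        // At this point at most one of the two offset buffers still holds
+        // entries; they index elements that remain on the wrong side of the
+        // split, so they are swapped inward one at a time until first lands
+        // exactly on the boundary between the two partitions.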
+ if (num_l) { + offsets_l += start_l; + while (num_l--) std::iter_swap(first + offsets_l[num_l], --last); + first = last; + } + if (num_r) { + offsets_r += start_r; + while (num_r--) std::iter_swap(last - offsets_r[num_r], first), ++first; + last = first; + } + + // Put the pivot in the right place. + Iter pivot_pos = first - 1; + *begin = PDQSORT_PREFER_MOVE(*pivot_pos); + *pivot_pos = PDQSORT_PREFER_MOVE(pivot); + + return std::make_pair(pivot_pos, already_partitioned); + } + + // Partitions [begin, end) around pivot *begin using comparison function comp. Elements equal + // to the pivot are put in the right-hand partition. Returns the position of the pivot after + // partitioning and whether the passed sequence already was correctly partitioned. Assumes the + // pivot is a median of at least 3 elements and that [begin, end) is at least + // insertion_sort_threshold long. + template + inline std::pair partition_right(Iter begin, Iter end, Compare comp) { + typedef typename std::iterator_traits::value_type T; + + // Move pivot into local for speed. + T pivot(PDQSORT_PREFER_MOVE(*begin)); + + Iter first = begin; + Iter last = end; + + // Find the first element greater than or equal than the pivot (the median of 3 guarantees + // this exists). + while (comp(*++first, pivot)); + + // Find the first element strictly smaller than the pivot. We have to guard this search if + // there was no element before *first. + if (first - 1 == begin) while (first < last && !comp(*--last, pivot)); + else while ( !comp(*--last, pivot)); + + // If the first pair of elements that should be swapped to partition are the same element, + // the passed in sequence already was correctly partitioned. + bool already_partitioned = first >= last; + + // Keep swapping pairs of elements that are on the wrong side of the pivot. Previously + // swapped pairs guard the searches, which is why the first iteration is special-cased + // above. + while (first < last) { + std::iter_swap(first, last); + while (comp(*++first, pivot)); + while (!comp(*--last, pivot)); + } + + // Put the pivot in the right place. + Iter pivot_pos = first - 1; + *begin = PDQSORT_PREFER_MOVE(*pivot_pos); + *pivot_pos = PDQSORT_PREFER_MOVE(pivot); + + return std::make_pair(pivot_pos, already_partitioned); + } + + // Similar function to the one above, except elements equal to the pivot are put to the left of + // the pivot and it doesn't check or return if the passed sequence already was partitioned. + // Since this is rarely used (the many equal case), and in that case pdqsort already has O(n) + // performance, no block quicksort is applied here for simplicity. + template + inline Iter partition_left(Iter begin, Iter end, Compare comp) { + typedef typename std::iterator_traits::value_type T; + + T pivot(PDQSORT_PREFER_MOVE(*begin)); + Iter first = begin; + Iter last = end; + + while (comp(pivot, *--last)); + + if (last + 1 == end) while (first < last && !comp(pivot, *++first)); + else while ( !comp(pivot, *++first)); + + while (first < last) { + std::iter_swap(first, last); + while (comp(pivot, *--last)); + while (!comp(pivot, *++first)); + } + + Iter pivot_pos = last; + *begin = PDQSORT_PREFER_MOVE(*pivot_pos); + *pivot_pos = PDQSORT_PREFER_MOVE(pivot); + + return pivot_pos; + } + + + template + inline void pdqsort_loop(Iter begin, Iter end, Compare comp, int bad_allowed, bool leftmost = true) { + typedef typename std::iterator_traits::difference_type diff_t; + + // Use a while loop for tail recursion elimination. 
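+        // The bad_allowed counter passed in by the public pdqsort wrappers
+        // starts at floor(log2(n)); every highly unbalanced partition
+        // decrements it, and when it reaches zero the remaining range is
+        // finished with std::make_heap/std::sort_heap, which bounds the
+        // worst case at O(n log n).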
+ while (true) { + diff_t size = end - begin; + + // Insertion sort is faster for small arrays. + if (size < insertion_sort_threshold) { + if (leftmost) insertion_sort(begin, end, comp); + else unguarded_insertion_sort(begin, end, comp); + return; + } + + // Choose pivot as median of 3 or pseudomedian of 9. + diff_t s2 = size / 2; + if (size > ninther_threshold) { + sort3(begin, begin + s2, end - 1, comp); + sort3(begin + 1, begin + (s2 - 1), end - 2, comp); + sort3(begin + 2, begin + (s2 + 1), end - 3, comp); + sort3(begin + (s2 - 1), begin + s2, begin + (s2 + 1), comp); + std::iter_swap(begin, begin + s2); + } else sort3(begin + s2, begin, end - 1, comp); + + // If *(begin - 1) is the end of the right partition of a previous partition operation + // there is no element in [begin, end) that is smaller than *(begin - 1). Then if our + // pivot compares equal to *(begin - 1) we change strategy, putting equal elements in + // the left partition, greater elements in the right partition. We do not have to + // recurse on the left partition, since it's sorted (all equal). + if (!leftmost && !comp(*(begin - 1), *begin)) { + begin = partition_left(begin, end, comp) + 1; + continue; + } + + // Partition and get results. + std::pair part_result = + Branchless ? partition_right_branchless(begin, end, comp) + : partition_right(begin, end, comp); + Iter pivot_pos = part_result.first; + bool already_partitioned = part_result.second; + + // Check for a highly unbalanced partition. + diff_t l_size = pivot_pos - begin; + diff_t r_size = end - (pivot_pos + 1); + bool highly_unbalanced = l_size < size / 8 || r_size < size / 8; + + // If we got a highly unbalanced partition we shuffle elements to break many patterns. + if (highly_unbalanced) { + // If we had too many bad partitions, switch to heapsort to guarantee O(n log n). + if (--bad_allowed == 0) { + std::make_heap(begin, end, comp); + std::sort_heap(begin, end, comp); + return; + } + + if (l_size >= insertion_sort_threshold) { + std::iter_swap(begin, begin + l_size / 4); + std::iter_swap(pivot_pos - 1, pivot_pos - l_size / 4); + + if (l_size > ninther_threshold) { + std::iter_swap(begin + 1, begin + (l_size / 4 + 1)); + std::iter_swap(begin + 2, begin + (l_size / 4 + 2)); + std::iter_swap(pivot_pos - 2, pivot_pos - (l_size / 4 + 1)); + std::iter_swap(pivot_pos - 3, pivot_pos - (l_size / 4 + 2)); + } + } + + if (r_size >= insertion_sort_threshold) { + std::iter_swap(pivot_pos + 1, pivot_pos + (1 + r_size / 4)); + std::iter_swap(end - 1, end - r_size / 4); + + if (r_size > ninther_threshold) { + std::iter_swap(pivot_pos + 2, pivot_pos + (2 + r_size / 4)); + std::iter_swap(pivot_pos + 3, pivot_pos + (3 + r_size / 4)); + std::iter_swap(end - 2, end - (1 + r_size / 4)); + std::iter_swap(end - 3, end - (2 + r_size / 4)); + } + } + } else { + // If we were decently balanced and we tried to sort an already partitioned + // sequence try to use insertion sort. + if (already_partitioned && partial_insertion_sort(begin, pivot_pos, comp) + && partial_insertion_sort(pivot_pos + 1, end, comp)) return; + } + + // Sort the left partition first using recursion and do tail recursion elimination for + // the right-hand partition. 
+ pdqsort_loop(begin, pivot_pos, comp, bad_allowed, leftmost); + begin = pivot_pos + 1; + leftmost = false; + } + } +} + + +template +inline void pdqsort(Iter begin, Iter end, Compare comp) { + if (begin == end) return; + +#if __cplusplus >= 201103L + pdqsort_detail::pdqsort_loop::type>::value && + std::is_arithmetic::value_type>::value>( + begin, end, comp, pdqsort_detail::log2(end - begin)); +#else + pdqsort_detail::pdqsort_loop( + begin, end, comp, pdqsort_detail::log2(end - begin)); +#endif +} + +template +inline void pdqsort(Iter begin, Iter end) { + typedef typename std::iterator_traits::value_type T; + pdqsort(begin, end, std::less()); +} + +template +inline void pdqsort_branchless(Iter begin, Iter end, Compare comp) { + if (begin == end) return; + pdqsort_detail::pdqsort_loop( + begin, end, comp, pdqsort_detail::log2(end - begin)); +} + +template +inline void pdqsort_branchless(Iter begin, Iter end) { + typedef typename std::iterator_traits::value_type T; + pdqsort_branchless(begin, end, std::less()); +} + + +#undef PDQSORT_PREFER_MOVE +#endif /* __cplusplus */ +COSMOPOLITAN_C_START_ + +void pdqsort_int64(int64_t *, size_t); +void pdqsort_branchless_int64(int64_t *, size_t); + +COSMOPOLITAN_C_END_ +#endif /* PDQSORT_H */ diff --git a/third_party/libcxx/rmi.h b/third_party/libcxx/rmi.h new file mode 100644 index 000000000..8e389dc5f --- /dev/null +++ b/third_party/libcxx/rmi.h @@ -0,0 +1,308 @@ +// -*- c++ -*- +// clang-format off +#ifndef COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_RMI_H_ +#define COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_RMI_H_ +#include "third_party/libcxx/iostream" +#include "third_party/libcxx/vector" + +using namespace std; + +namespace learned_sort { + +// Packs the key and its respective scaled CDF value +template +struct training_point { + T x; + double y; +}; + +// Represents linear models +struct linear_model { + double slope = 0; + double intercept = 0; +}; + +// An implementation of a 2-layer RMI model +template +class TwoLayerRMI { + public: + // CDF model hyperparameters + struct Params { + // Member fields + long fanout; + float sampling_rate; + long threshold; + long num_leaf_models; + + // Default hyperparameters + static constexpr long DEFAULT_FANOUT = 1e3; + static constexpr float DEFAULT_SAMPLING_RATE = .01; + static constexpr long DEFAULT_THRESHOLD = 100; + static constexpr long DEFAULT_NUM_LEAF_MODELS = 1000; + static constexpr long MIN_SORTING_SIZE = 1e4; + + // Default constructor + Params() { + this->fanout = DEFAULT_FANOUT; + this->sampling_rate = DEFAULT_SAMPLING_RATE; + this->threshold = DEFAULT_THRESHOLD; + this->num_leaf_models = DEFAULT_NUM_LEAF_MODELS; + } + + // Constructor with custom hyperparameter values + Params(float sampling_rate, float overallocation, long fanout, + long batch_sz, long threshold) { + this->fanout = fanout; + this->sampling_rate = sampling_rate; + this->threshold = threshold; + this->num_leaf_models = DEFAULT_NUM_LEAF_MODELS; + } + }; + + // Member variables of the CDF model + bool trained; + linear_model root_model; + vector leaf_models; + vector training_sample; + Params hp; + bool enable_dups_detection; + + // CDF model constructor + TwoLayerRMI(Params p) { + this->trained = false; + this->hp = p; + this->leaf_models.resize(p.num_leaf_models); + this->enable_dups_detection = true; + } + + // Pretty-printing function + void print() { + printf("[0][0]: slope=%0.5f; intercept=%0.5f;\n", root_model.slope, + root_model.intercept); + for (int model_idx = 0; model_idx < hp.num_leaf_models; ++model_idx) { + printf("[%i][1]: 
slope=%0.5f; intercept=%0.5f;\n", model_idx, + leaf_models[model_idx].slope, leaf_models[model_idx].intercept); + } + cout << "-----------------------------" << endl; + } + + /** + * @brief Train a CDF function with an RMI architecture, using linear spline + * interpolation. + * + * @param begin Random-access iterators to the initial position of the + * sequence to be used for sorting. The range used is [begin,end), which + * contains all the elements between first and last, including the element + * pointed by first but not the element pointed by last. + * @param end Random-access iterators to the last position of the sequence to + * be used for sorting. The range used is [begin,end), which contains all the + * elements between first and last, including the element pointed by first but + * not the element pointed by last. + * @return true if the model was trained successfully, false otherwise. + */ + bool train(typename vector::iterator begin, + typename vector::iterator end) { + // Determine input size + const long INPUT_SZ = std::distance(begin, end); + + // Validate parameters + if (this->hp.fanout >= INPUT_SZ) { + this->hp.fanout = TwoLayerRMI::Params::DEFAULT_FANOUT; + cerr << "\33[93;1mWARNING\33[0m: Invalid fanout. Using default (" + << TwoLayerRMI::Params::DEFAULT_FANOUT << ")." << endl; + } + + if (this->hp.sampling_rate <= 0 or this->hp.sampling_rate > 1) { + this->hp.sampling_rate = TwoLayerRMI::Params::DEFAULT_SAMPLING_RATE; + cerr << "\33[93;1mWARNING\33[0m: Invalid sampling rate. Using default (" + << TwoLayerRMI::Params::DEFAULT_SAMPLING_RATE << ")." << endl; + } + + if (this->hp.threshold <= 0 or this->hp.threshold >= INPUT_SZ or + this->hp.threshold >= INPUT_SZ / this->hp.fanout) { + this->hp.threshold = TwoLayerRMI::Params::DEFAULT_THRESHOLD; + cerr << "\33[93;1mWARNING\33[0m: Invalid threshold. Using default (" + << TwoLayerRMI::Params::DEFAULT_THRESHOLD << ")." << endl; + } + + // Initialize the CDF model + static const long NUM_LAYERS = 2; + vector > > > training_data(NUM_LAYERS); + for (long layer_idx = 0; layer_idx < NUM_LAYERS; ++layer_idx) { + training_data[layer_idx].resize(hp.num_leaf_models); + } + + //----------------------------------------------------------// + // SAMPLE // + //----------------------------------------------------------// + + // Determine sample size + const long SAMPLE_SZ = std::min( + INPUT_SZ, std::max(this->hp.sampling_rate * INPUT_SZ, + TwoLayerRMI::Params::MIN_SORTING_SIZE)); + + // Create a sample array + this->training_sample.reserve(SAMPLE_SZ); + + // Start sampling + long offset = static_cast(1. * INPUT_SZ / SAMPLE_SZ); + for (auto i = begin; i < end; i += offset) { + // NOTE: We don't directly assign SAMPLE_SZ to this->training_sample_sz + // to avoid issues with divisibility + this->training_sample.push_back(*i); + } + + // Sort the sample using the provided comparison function + std::sort(this->training_sample.begin(), this->training_sample.end()); + + // Count the number of unique keys + auto sample_cpy = this->training_sample; + long num_unique_elms = std::distance( + sample_cpy.begin(), std::unique(sample_cpy.begin(), sample_cpy.end())); + + // Stop early if the array has very few unique values. We need at least 2 + // unique training examples per leaf model. 
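+    // With the default of 1000 leaf models the sample therefore needs at
+    // least 2000 distinct keys; otherwise train() reports failure so the
+    // caller can fall back to an ordinary comparison sort. If more than 90%
+    // of the sampled keys are unique, duplicate detection is switched off.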
+ if (num_unique_elms < 2 * this->hp.num_leaf_models) { + return false; + } else if (num_unique_elms > .9 * training_sample.size()) { + this->enable_dups_detection = false; + } + + //----------------------------------------------------------// + // TRAIN THE MODELS // + //----------------------------------------------------------// + + // Populate the training data for the root model + for (long i = 0; i < SAMPLE_SZ; ++i) { + training_data[0][0].push_back( + {this->training_sample[i], 1. * i / SAMPLE_SZ}); + } + + // Train the root model using linear interpolation + auto *current_training_data = &training_data[0][0]; + linear_model *current_model = &(this->root_model); + + // Find the min and max values in the training set + training_point min = current_training_data->front(); + training_point max = current_training_data->back(); + + // Calculate the slope and intercept terms, assuming min.y = 0 and max.y + current_model->slope = 1. / (max.x - min.x); + current_model->intercept = -current_model->slope * min.x; + + // Extrapolate for the number of models in the next layer + current_model->slope *= this->hp.num_leaf_models - 1; + current_model->intercept *= this->hp.num_leaf_models - 1; + + // Populate the training data for the next layer + for (const auto &d : *current_training_data) { + // Predict the model index in next layer + long rank = current_model->slope * d.x + current_model->intercept; + + // Normalize the rank between 0 and the number of models in the next layer + rank = std::max(static_cast(0), + std::min(this->hp.num_leaf_models - 1, rank)); + + // Place the data in the predicted training bucket + training_data[1][rank].push_back(d); + } + + // Train the leaf models + for (long model_idx = 0; model_idx < this->hp.num_leaf_models; + ++model_idx) { + // Update iterator variables + current_training_data = &training_data[1][model_idx]; + current_model = &(this->leaf_models[model_idx]); + + // Interpolate the min points in the training buckets + if (model_idx == 0) { + // The current model is the first model in the current layer + + if (current_training_data->size() < 2) { + // Case 1: The first model in this layer is empty + current_model->slope = 0; + current_model->intercept = 0; + + // Insert a fictive training point to avoid propagating more than one + // empty initial models. + training_point tp; + tp.x = 0; + tp.y = 0; + current_training_data->push_back(tp); + } else { + // Case 2: The first model in this layer is not empty + + min = current_training_data->front(); + max = current_training_data->back(); + + // Hallucinating as if min.y = 0 + current_model->slope = (1. * max.y) / (max.x - min.x); + current_model->intercept = min.y - current_model->slope * min.x; + } + } else if (model_idx == this->hp.num_leaf_models - 1) { + if (current_training_data->empty()) { + // Case 3: The final model in this layer is empty + + current_model->slope = 0; + current_model->intercept = 1; + } else { + // Case 4: The last model in this layer is not empty + + min = training_data[1][model_idx - 1].back(); + max = current_training_data->back(); + + // Hallucinating as if max.y = 1 + current_model->slope = (1. 
- min.y) / (max.x - min.x); + current_model->intercept = min.y - current_model->slope * min.x; + } + } else { + // The current model is not the first model in the current layer + + if (current_training_data->empty()) { + // Case 5: The intermediate model in this layer is empty + current_model->slope = 0; + current_model->intercept = training_data[1][model_idx - 1] + .back() + .y; // If the previous model + // was empty too, it will + // use the fictive + // training points + + // Insert a fictive training point to avoid propagating more than one + // empty initial models. + // NOTE: This will _NOT_ throw to DIV/0 due to identical x's and y's + // because it is working backwards. + training_point tp; + tp.x = training_data[1][model_idx - 1].back().x; + tp.y = training_data[1][model_idx - 1].back().y; + current_training_data->push_back(tp); + } else { + // Case 6: The intermediate leaf model is not empty + + min = training_data[1][model_idx - 1].back(); + max = current_training_data->back(); + + current_model->slope = (max.y - min.y) / (max.x - min.x); + current_model->intercept = min.y - current_model->slope * min.x; + } + } + } + + // NOTE: + // The last stage (layer) of this model contains weights that predict the + // CDF of the keys (i.e. Range is [0-1]) When using this model to predict + // the position of the keys in the sorted order, you MUST scale the weights + // of the last layer to whatever range you are predicting for. The inner + // layers of the model have already been extrapolated to the length of the + // stage.git + // + // This is a design choice to help with the portability of the model. + // + this->trained = true; + + return true; + } +}; +} // namespace learned_sort + +#endif /* COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_RMI_H_ */ diff --git a/third_party/libcxx/ska_sort.cc b/third_party/libcxx/ska_sort.cc new file mode 100644 index 000000000..43b70a7d2 --- /dev/null +++ b/third_party/libcxx/ska_sort.cc @@ -0,0 +1,22 @@ +/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│ +│vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2023 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "third_party/libcxx/ska_sort.h" +#include "third_party/libcxx/ska_sort.h" + +void ska_sort_int64(int64_t* A, size_t n) { ska_sort(A, A + n); } diff --git a/third_party/libcxx/ska_sort.h b/third_party/libcxx/ska_sort.h new file mode 100644 index 000000000..7b808bd4b --- /dev/null +++ b/third_party/libcxx/ska_sort.h @@ -0,0 +1,1457 @@ +// -*- c++ -*- +// clang-format off +// +// Copyright Malte Skarupke 2016. 
+// Distributed under the Boost Software License, Version 1.0. +// (See http://www.boost.org/LICENSE_1_0.txt) +// +#ifndef COSMOPOLITAN_LIBC_MEM_SKA_SORT_H_ +#define COSMOPOLITAN_LIBC_MEM_SKA_SORT_H_ +#ifdef __cplusplus +#include "third_party/libcxx/algorithm" +#include "third_party/libcxx/cstdint" +#include "third_party/libcxx/tuple" +#include "third_party/libcxx/type_traits" +#include "third_party/libcxx/utility" + +namespace detail +{ +template +void counting_sort_impl(It begin, It end, OutIt out_begin, ExtractKey && extract_key) +{ + count_type counts[256] = {}; + for (It it = begin; it != end; ++it) + { + ++counts[extract_key(*it)]; + } + count_type total = 0; + for (count_type & count : counts) + { + count_type old_count = count; + count = total; + total += old_count; + } + for (; begin != end; ++begin) + { + std::uint8_t key = extract_key(*begin); + out_begin[counts[key]++] = std::move(*begin); + } +} +template +void counting_sort_impl(It begin, It end, OutIt out_begin, ExtractKey && extract_key) +{ + counting_sort_impl(begin, end, out_begin, extract_key); +} +inline bool to_unsigned_or_bool(bool b) +{ + return b; +} +inline unsigned char to_unsigned_or_bool(unsigned char c) +{ + return c; +} +inline unsigned char to_unsigned_or_bool(signed char c) +{ + return static_cast(c) + 128; +} +inline unsigned char to_unsigned_or_bool(char c) +{ + return static_cast(c); +} +inline std::uint16_t to_unsigned_or_bool(char16_t c) +{ + return static_cast(c); +} +inline std::uint32_t to_unsigned_or_bool(char32_t c) +{ + return static_cast(c); +} +inline std::uint32_t to_unsigned_or_bool(wchar_t c) +{ + return static_cast(c); +} +inline unsigned short to_unsigned_or_bool(short i) +{ + return static_cast(i) + static_cast(1 << (sizeof(short) * 8 - 1)); +} +inline unsigned short to_unsigned_or_bool(unsigned short i) +{ + return i; +} +inline unsigned int to_unsigned_or_bool(int i) +{ + return static_cast(i) + static_cast(1 << (sizeof(int) * 8 - 1)); +} +inline unsigned int to_unsigned_or_bool(unsigned int i) +{ + return i; +} +inline unsigned long to_unsigned_or_bool(long l) +{ + return static_cast(l) + static_cast(1l << (sizeof(long) * 8 - 1)); +} +inline unsigned long to_unsigned_or_bool(unsigned long l) +{ + return l; +} +inline unsigned long long to_unsigned_or_bool(long long l) +{ + return static_cast(l) + static_cast(1ll << (sizeof(long long) * 8 - 1)); +} +inline unsigned long long to_unsigned_or_bool(unsigned long long l) +{ + return l; +} +inline std::uint32_t to_unsigned_or_bool(float f) +{ + union + { + float f; + std::uint32_t u; + } as_union = { f }; + std::uint32_t sign_bit = -std::int32_t(as_union.u >> 31); + return as_union.u ^ (sign_bit | 0x80000000); +} +inline std::uint64_t to_unsigned_or_bool(double f) +{ + union + { + double d; + std::uint64_t u; + } as_union = { f }; + std::uint64_t sign_bit = -std::int64_t(as_union.u >> 63); + return as_union.u ^ (sign_bit | 0x8000000000000000); +} +template +inline size_t to_unsigned_or_bool(T * ptr) +{ + return reinterpret_cast(ptr); +} + +template +struct SizedRadixSorter; + +template<> +struct SizedRadixSorter<1> +{ + template + static bool sort(It begin, It end, OutIt buffer_begin, ExtractKey && extract_key) + { + counting_sort_impl(begin, end, buffer_begin, [&](auto && o) + { + return to_unsigned_or_bool(extract_key(o)); + }); + return true; + } + + static constexpr size_t pass_count = 2; +}; +template<> +struct SizedRadixSorter<2> +{ + template + static bool sort(It begin, It end, OutIt buffer_begin, ExtractKey && extract_key) + { + 
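+        // The size check below only chooses the width of the per-pass
+        // counters: 32-bit counting arrays suffice for up to 2^32 elements,
+        // larger inputs use 64-bit counters before the byte-wise passes run.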
std::ptrdiff_t num_elements = end - begin; + if (num_elements <= (1ll << 32)) + return sort_inline(begin, end, buffer_begin, buffer_begin + num_elements, extract_key); + else + return sort_inline(begin, end, buffer_begin, buffer_begin + num_elements, extract_key); + } + + template + static bool sort_inline(It begin, It end, OutIt out_begin, OutIt out_end, ExtractKey && extract_key) + { + count_type counts0[256] = {}; + count_type counts1[256] = {}; + + for (It it = begin; it != end; ++it) + { + uint16_t key = to_unsigned_or_bool(extract_key(*it)); + ++counts0[key & 0xff]; + ++counts1[(key >> 8) & 0xff]; + } + count_type total0 = 0; + count_type total1 = 0; + for (int i = 0; i < 256; ++i) + { + count_type old_count0 = counts0[i]; + count_type old_count1 = counts1[i]; + counts0[i] = total0; + counts1[i] = total1; + total0 += old_count0; + total1 += old_count1; + } + for (It it = begin; it != end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)); + out_begin[counts0[key]++] = std::move(*it); + } + for (OutIt it = out_begin; it != out_end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)) >> 8; + begin[counts1[key]++] = std::move(*it); + } + return false; + } + + static constexpr size_t pass_count = 3; +}; +template<> +struct SizedRadixSorter<4> +{ + + template + static bool sort(It begin, It end, OutIt buffer_begin, ExtractKey && extract_key) + { + std::ptrdiff_t num_elements = end - begin; + if (num_elements <= (1ll << 32)) + return sort_inline(begin, end, buffer_begin, buffer_begin + num_elements, extract_key); + else + return sort_inline(begin, end, buffer_begin, buffer_begin + num_elements, extract_key); + } + template + static bool sort_inline(It begin, It end, OutIt out_begin, OutIt out_end, ExtractKey && extract_key) + { + count_type counts0[256] = {}; + count_type counts1[256] = {}; + count_type counts2[256] = {}; + count_type counts3[256] = {}; + + for (It it = begin; it != end; ++it) + { + uint32_t key = to_unsigned_or_bool(extract_key(*it)); + ++counts0[key & 0xff]; + ++counts1[(key >> 8) & 0xff]; + ++counts2[(key >> 16) & 0xff]; + ++counts3[(key >> 24) & 0xff]; + } + count_type total0 = 0; + count_type total1 = 0; + count_type total2 = 0; + count_type total3 = 0; + for (int i = 0; i < 256; ++i) + { + count_type old_count0 = counts0[i]; + count_type old_count1 = counts1[i]; + count_type old_count2 = counts2[i]; + count_type old_count3 = counts3[i]; + counts0[i] = total0; + counts1[i] = total1; + counts2[i] = total2; + counts3[i] = total3; + total0 += old_count0; + total1 += old_count1; + total2 += old_count2; + total3 += old_count3; + } + for (It it = begin; it != end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)); + out_begin[counts0[key]++] = std::move(*it); + } + for (OutIt it = out_begin; it != out_end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)) >> 8; + begin[counts1[key]++] = std::move(*it); + } + for (It it = begin; it != end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)) >> 16; + out_begin[counts2[key]++] = std::move(*it); + } + for (OutIt it = out_begin; it != out_end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)) >> 24; + begin[counts3[key]++] = std::move(*it); + } + return false; + } + + static constexpr size_t pass_count = 5; +}; +template<> +struct SizedRadixSorter<8> +{ + template + static bool sort(It begin, It end, OutIt buffer_begin, ExtractKey && extract_key) + { + std::ptrdiff_t num_elements = end - begin; + if (num_elements <= (1ll 
<< 32)) + return sort_inline(begin, end, buffer_begin, buffer_begin + num_elements, extract_key); + else + return sort_inline(begin, end, buffer_begin, buffer_begin + num_elements, extract_key); + } + template + static bool sort_inline(It begin, It end, OutIt out_begin, OutIt out_end, ExtractKey && extract_key) + { + count_type counts0[256] = {}; + count_type counts1[256] = {}; + count_type counts2[256] = {}; + count_type counts3[256] = {}; + count_type counts4[256] = {}; + count_type counts5[256] = {}; + count_type counts6[256] = {}; + count_type counts7[256] = {}; + + for (It it = begin; it != end; ++it) + { + uint64_t key = to_unsigned_or_bool(extract_key(*it)); + ++counts0[key & 0xff]; + ++counts1[(key >> 8) & 0xff]; + ++counts2[(key >> 16) & 0xff]; + ++counts3[(key >> 24) & 0xff]; + ++counts4[(key >> 32) & 0xff]; + ++counts5[(key >> 40) & 0xff]; + ++counts6[(key >> 48) & 0xff]; + ++counts7[(key >> 56) & 0xff]; + } + count_type total0 = 0; + count_type total1 = 0; + count_type total2 = 0; + count_type total3 = 0; + count_type total4 = 0; + count_type total5 = 0; + count_type total6 = 0; + count_type total7 = 0; + for (int i = 0; i < 256; ++i) + { + count_type old_count0 = counts0[i]; + count_type old_count1 = counts1[i]; + count_type old_count2 = counts2[i]; + count_type old_count3 = counts3[i]; + count_type old_count4 = counts4[i]; + count_type old_count5 = counts5[i]; + count_type old_count6 = counts6[i]; + count_type old_count7 = counts7[i]; + counts0[i] = total0; + counts1[i] = total1; + counts2[i] = total2; + counts3[i] = total3; + counts4[i] = total4; + counts5[i] = total5; + counts6[i] = total6; + counts7[i] = total7; + total0 += old_count0; + total1 += old_count1; + total2 += old_count2; + total3 += old_count3; + total4 += old_count4; + total5 += old_count5; + total6 += old_count6; + total7 += old_count7; + } + for (It it = begin; it != end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)); + out_begin[counts0[key]++] = std::move(*it); + } + for (OutIt it = out_begin; it != out_end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)) >> 8; + begin[counts1[key]++] = std::move(*it); + } + for (It it = begin; it != end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)) >> 16; + out_begin[counts2[key]++] = std::move(*it); + } + for (OutIt it = out_begin; it != out_end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)) >> 24; + begin[counts3[key]++] = std::move(*it); + } + for (It it = begin; it != end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)) >> 32; + out_begin[counts4[key]++] = std::move(*it); + } + for (OutIt it = out_begin; it != out_end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)) >> 40; + begin[counts5[key]++] = std::move(*it); + } + for (It it = begin; it != end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)) >> 48; + out_begin[counts6[key]++] = std::move(*it); + } + for (OutIt it = out_begin; it != out_end; ++it) + { + std::uint8_t key = to_unsigned_or_bool(extract_key(*it)) >> 56; + begin[counts7[key]++] = std::move(*it); + } + return false; + } + + static constexpr size_t pass_count = 9; +}; + +template +struct RadixSorter; +template<> +struct RadixSorter +{ + template + static bool sort(It begin, It end, OutIt buffer_begin, ExtractKey && extract_key) + { + size_t false_count = 0; + for (It it = begin; it != end; ++it) + { + if (!extract_key(*it)) + ++false_count; + } + size_t true_position = false_count; + false_count = 0; + 
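+        // Second pass: a stable two-bucket distribution. Keys that are false
+        // are written to buffer_begin[0 .. false_count), keys that are true
+        // start at true_position, immediately after the last false element.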
for (; begin != end; ++begin) + { + if (extract_key(*begin)) + buffer_begin[true_position++] = std::move(*begin); + else + buffer_begin[false_count++] = std::move(*begin); + } + return true; + } + + static constexpr size_t pass_count = 2; +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template<> +struct RadixSorter : SizedRadixSorter +{ +}; +template +struct RadixSorter > +{ + template + static bool sort(It begin, It end, OutIt buffer_begin, ExtractKey && extract_key) + { + bool first_result = RadixSorter::sort(begin, end, buffer_begin, [&](auto && o) + { + return extract_key(o).second; + }); + auto extract_first = [&](auto && o) + { + return extract_key(o).first; + }; + + if (first_result) + { + return !RadixSorter::sort(buffer_begin, buffer_begin + (end - begin), begin, extract_first); + } + else + { + return RadixSorter::sort(begin, end, buffer_begin, extract_first); + } + } + + static constexpr size_t pass_count = RadixSorter::pass_count + RadixSorter::pass_count; +}; +template +struct RadixSorter &> +{ + template + static bool sort(It begin, It end, OutIt buffer_begin, ExtractKey && extract_key) + { + bool first_result = RadixSorter::sort(begin, end, buffer_begin, [&](auto && o) -> const V & + { + return extract_key(o).second; + }); + auto extract_first = [&](auto && o) -> const K & + { + return extract_key(o).first; + }; + + if (first_result) + { + return !RadixSorter::sort(buffer_begin, buffer_begin + (end - begin), begin, extract_first); + } + else + { + return RadixSorter::sort(begin, end, buffer_begin, extract_first); + } + } + + static constexpr size_t pass_count = RadixSorter::pass_count + RadixSorter::pass_count; +}; +template +struct TupleRadixSorter +{ + using NextSorter = TupleRadixSorter; + using ThisSorter = RadixSorter::type>; + + template + static bool sort(It begin, It end, OutIt out_begin, OutIt out_end, ExtractKey && extract_key) + { + bool which = NextSorter::sort(begin, end, out_begin, out_end, extract_key); + auto extract_i = [&](auto && o) + { + return std::get(extract_key(o)); + }; + if (which) + return !ThisSorter::sort(out_begin, out_end, begin, extract_i); + else + return ThisSorter::sort(begin, end, out_begin, extract_i); + } + + static constexpr size_t pass_count = ThisSorter::pass_count + NextSorter::pass_count; +}; +template +struct TupleRadixSorter +{ + using NextSorter = TupleRadixSorter; + using ThisSorter = RadixSorter::type>; + + template + static bool sort(It begin, It end, OutIt out_begin, OutIt out_end, ExtractKey && extract_key) + { + bool which = NextSorter::sort(begin, end, out_begin, out_end, extract_key); + auto extract_i = [&](auto && o) -> decltype(auto) + { + return std::get(extract_key(o)); + }; + if (which) + return 
!ThisSorter::sort(out_begin, out_end, begin, extract_i); + else + return ThisSorter::sort(begin, end, out_begin, extract_i); + } + + static constexpr size_t pass_count = ThisSorter::pass_count + NextSorter::pass_count; +}; +template +struct TupleRadixSorter +{ + template + static bool sort(It, It, OutIt, OutIt, ExtractKey &&) + { + return false; + } + + static constexpr size_t pass_count = 0; +}; +template +struct TupleRadixSorter +{ + template + static bool sort(It, It, OutIt, OutIt, ExtractKey &&) + { + return false; + } + + static constexpr size_t pass_count = 0; +}; + +template +struct RadixSorter > +{ + using SorterImpl = TupleRadixSorter<0, sizeof...(Args), std::tuple >; + + template + static bool sort(It begin, It end, OutIt buffer_begin, ExtractKey && extract_key) + { + return SorterImpl::sort(begin, end, buffer_begin, buffer_begin + (end - begin), extract_key); + } + + static constexpr size_t pass_count = SorterImpl::pass_count; +}; + +template +struct RadixSorter &> +{ + using SorterImpl = TupleRadixSorter<0, sizeof...(Args), const std::tuple &>; + + template + static bool sort(It begin, It end, OutIt buffer_begin, ExtractKey && extract_key) + { + return SorterImpl::sort(begin, end, buffer_begin, buffer_begin + (end - begin), extract_key); + } + + static constexpr size_t pass_count = SorterImpl::pass_count; +}; + +template +struct RadixSorter > +{ + template + static bool sort(It begin, It end, OutIt buffer_begin, ExtractKey && extract_key) + { + auto buffer_end = buffer_begin + (end - begin); + bool which = false; + for (size_t i = S; i > 0; --i) + { + auto extract_i = [&, i = i - 1](auto && o) + { + return extract_key(o)[i]; + }; + if (which) + which = !RadixSorter::sort(buffer_begin, buffer_end, begin, extract_i); + else + which = RadixSorter::sort(begin, end, buffer_begin, extract_i); + } + return which; + } + + static constexpr size_t pass_count = RadixSorter::pass_count * S; +}; + +template +struct RadixSorter : RadixSorter +{ +}; +template +struct RadixSorter : RadixSorter +{ +}; +template +struct RadixSorter : RadixSorter +{ +}; +template +struct RadixSorter : RadixSorter +{ +}; +template +struct RadixSorter : RadixSorter +{ +}; +// these structs serve two purposes +// 1. they serve as illustration for how to implement the to_radix_sort_key function +// 2. they help produce better error messages. 
with these overloads you get the +// error message "no matching function for call to to_radix_sort(your_type)" +// without these examples, you'd get the error message "to_radix_sort_key was +// not declared in this scope" which is a much less useful error message +struct ExampleStructA { int i; }; +struct ExampleStructB { float f; }; +inline int to_radix_sort_key(ExampleStructA a) { return a.i; } +inline float to_radix_sort_key(ExampleStructB b) { return b.f; } +template +struct FallbackRadixSorter : RadixSorter()))> +{ + using base = RadixSorter()))>; + + template + static bool sort(It begin, It end, OutIt buffer_begin, ExtractKey && extract_key) + { + return base::sort(begin, end, buffer_begin, [&](auto && a) -> decltype(auto) + { + return to_radix_sort_key(extract_key(a)); + }); + } +}; + +template +struct nested_void +{ + using type = void; +}; + +template +using void_t = typename nested_void::type; + +template +struct has_subscript_operator_impl +{ + template()[0])> + static std::true_type test(int); + template + static std::false_type test(...); + + using type = decltype(test(0)); +}; + +template +using has_subscript_operator = typename has_subscript_operator_impl::type; + + +template +struct FallbackRadixSorter()))> > + : RadixSorter()))> +{ +}; + +template +struct RadixSorter : FallbackRadixSorter +{ +}; + +template +size_t radix_sort_pass_count = RadixSorter::pass_count; + +template +inline void unroll_loop_four_times(It begin, size_t iteration_count, Func && to_call) +{ + size_t loop_count = iteration_count / 4; + size_t remainder_count = iteration_count - loop_count * 4; + for (; loop_count > 0; --loop_count) + { + to_call(begin); + ++begin; + to_call(begin); + ++begin; + to_call(begin); + ++begin; + to_call(begin); + ++begin; + } + switch(remainder_count) + { + case 3: + to_call(begin); + ++begin; + case 2: + to_call(begin); + ++begin; + case 1: + to_call(begin); + } +} + +template +inline It custom_std_partition(It begin, It end, F && func) +{ + for (;; ++begin) + { + if (begin == end) + return end; + if (!func(*begin)) + break; + } + It it = begin; + for(++it; it != end; ++it) + { + if (!func(*it)) + continue; + + std::iter_swap(begin, it); + ++begin; + } + return begin; +} + +struct PartitionInfo +{ + PartitionInfo() + : count(0) + { + } + + union + { + size_t count; + size_t offset; + }; + size_t next_offset; +}; + +template +struct UnsignedForSize; +template<> +struct UnsignedForSize<1> +{ + typedef uint8_t type; +}; +template<> +struct UnsignedForSize<2> +{ + typedef uint16_t type; +}; +template<> +struct UnsignedForSize<4> +{ + typedef uint32_t type; +}; +template<> +struct UnsignedForSize<8> +{ + typedef uint64_t type; +}; +template +struct SubKey; +template +struct SizedSubKey +{ + template + static auto sub_key(T && value, void *) + { + return to_unsigned_or_bool(value); + } + + typedef SubKey next; + + using sub_key_type = typename UnsignedForSize::type; +}; +template +struct SubKey : SubKey +{ +}; +template +struct SubKey : SubKey +{ +}; +template +struct SubKey : SubKey +{ +}; +template +struct SubKey : SubKey +{ +}; +template +struct SubKey : SubKey +{ +}; +template +struct FallbackSubKey + : SubKey()))> +{ + using base = SubKey()))>; + + template + static decltype(auto) sub_key(U && value, void * data) + { + return base::sub_key(to_radix_sort_key(value), data); + } +}; +template +struct FallbackSubKey()))> > + : SubKey()))> +{ +}; +template +struct SubKey : FallbackSubKey +{ +}; +template<> +struct SubKey +{ + template + static bool sub_key(T && value, void *) + 
{ + return value; + } + + typedef SubKey next; + + using sub_key_type = bool; +}; +template<> +struct SubKey; +template<> +struct SubKey : SizedSubKey +{ +}; +template<> +struct SubKey : SizedSubKey +{ +}; +template<> +struct SubKey : SizedSubKey +{ +}; +template<> +struct SubKey : SizedSubKey +{ +}; +template<> +struct SubKey : SizedSubKey +{ +}; +template +struct SubKey : SizedSubKey +{ +}; +template +struct PairSecondSubKey : Current +{ + static decltype(auto) sub_key(const std::pair & value, void * sort_data) + { + return Current::sub_key(value.second, sort_data); + } + + using next = typename std::conditional, typename Current::next>::value, SubKey, PairSecondSubKey >::type; +}; +template +struct PairFirstSubKey : Current +{ + static decltype(auto) sub_key(const std::pair & value, void * sort_data) + { + return Current::sub_key(value.first, sort_data); + } + + using next = typename std::conditional, typename Current::next>::value, PairSecondSubKey >, PairFirstSubKey >::type; +}; +template +struct SubKey > : PairFirstSubKey > +{ +}; +template +struct TypeAt : TypeAt +{ +}; +template +struct TypeAt<0, First, More...> +{ + typedef First type; +}; + +template +struct TupleSubKey; + +template +struct NextTupleSubKey +{ + using type = TupleSubKey; +}; +template +struct NextTupleSubKey, First, Second, More...> +{ + using type = TupleSubKey, Second, More...>; +}; +template +struct NextTupleSubKey, First> +{ + using type = SubKey; +}; + +template +struct TupleSubKey : Current +{ + template + static decltype(auto) sub_key(const Tuple & value, void * sort_data) + { + return Current::sub_key(std::get(value), sort_data); + } + + using next = typename NextTupleSubKey::type; +}; +template +struct TupleSubKey : Current +{ + template + static decltype(auto) sub_key(const Tuple & value, void * sort_data) + { + return Current::sub_key(std::get(value), sort_data); + } + + using next = typename NextTupleSubKey::type; +}; +template +struct SubKey > : TupleSubKey<0, SubKey, First, More...> +{ +}; + +struct BaseListSortData +{ + size_t current_index; + size_t recursion_limit; + void * next_sort_data; +}; +template +struct ListSortData : BaseListSortData +{ + void (*next_sort)(It, It, std::ptrdiff_t, ExtractKey &, void *); +}; + +template +struct ListElementSubKey : SubKey()[0])>::type> +{ + using base = SubKey()[0])>::type>; + + using next = ListElementSubKey; + + template + static decltype(auto) sub_key(U && value, void * sort_data) + { + BaseListSortData * list_sort_data = static_cast(sort_data); + const T & list = CurrentSubKey::sub_key(value, list_sort_data->next_sort_data); + return base::sub_key(list[list_sort_data->current_index], list_sort_data->next_sort_data); + } +}; + +template +struct ListSubKey +{ + using next = SubKey; + + using sub_key_type = T; + + static const T & sub_key(const T & value, void *) + { + return value; + } +}; + +template +struct FallbackSubKey::value>::type> : ListSubKey +{ +}; + +template +inline void StdSortFallback(It begin, It end, ExtractKey & extract_key) +{ + std::sort(begin, end, [&](auto && l, auto && r){ return extract_key(l) < extract_key(r); }); +} + +template +inline bool StdSortIfLessThanThreshold(It begin, It end, std::ptrdiff_t num_elements, ExtractKey & extract_key) +{ + if (num_elements <= 1) + return true; + if (num_elements >= StdSortThreshold) + return false; + StdSortFallback(begin, end, extract_key); + return true; +} + +template +struct InplaceSorter; + +template +struct UnsignedInplaceSorter +{ + static constexpr size_t ShiftAmount = (((NumBytes - 1) 
- Offset) * 8); + template + inline static uint8_t current_byte(T && elem, void * sort_data) + { + return CurrentSubKey::sub_key(elem, sort_data) >> ShiftAmount; + } + template + static void sort(It begin, It end, std::ptrdiff_t num_elements, ExtractKey & extract_key, void (*next_sort)(It, It, std::ptrdiff_t, ExtractKey &, void *), void * sort_data) + { + if (num_elements < AmericanFlagSortThreshold) + american_flag_sort(begin, end, extract_key, next_sort, sort_data); + else + ska_byte_sort(begin, end, extract_key, next_sort, sort_data); + } + + template + static void american_flag_sort(It begin, It end, ExtractKey & extract_key, void (*next_sort)(It, It, std::ptrdiff_t, ExtractKey &, void *), void * sort_data) + { + PartitionInfo partitions[256]; + for (It it = begin; it != end; ++it) + { + ++partitions[current_byte(extract_key(*it), sort_data)].count; + } + size_t total = 0; + uint8_t remaining_partitions[256]; + int num_partitions = 0; + for (int i = 0; i < 256; ++i) + { + size_t count = partitions[i].count; + if (!count) + continue; + partitions[i].offset = total; + total += count; + partitions[i].next_offset = total; + remaining_partitions[num_partitions] = i; + ++num_partitions; + } + if (num_partitions > 1) + { + uint8_t * current_block_ptr = remaining_partitions; + PartitionInfo * current_block = partitions + *current_block_ptr; + uint8_t * last_block = remaining_partitions + num_partitions - 1; + It it = begin; + It block_end = begin + current_block->next_offset; + It last_element = end - 1; + for (;;) + { + PartitionInfo * block = partitions + current_byte(extract_key(*it), sort_data); + if (block == current_block) + { + ++it; + if (it == last_element) + break; + else if (it == block_end) + { + for (;;) + { + ++current_block_ptr; + if (current_block_ptr == last_block) + goto recurse; + current_block = partitions + *current_block_ptr; + if (current_block->offset != current_block->next_offset) + break; + } + + it = begin + current_block->offset; + block_end = begin + current_block->next_offset; + } + } + else + { + size_t offset = block->offset++; + std::iter_swap(it, begin + offset); + } + } + } + recurse: + if (Offset + 1 != NumBytes || next_sort) + { + size_t start_offset = 0; + It partition_begin = begin; + for (uint8_t * it = remaining_partitions, * end = remaining_partitions + num_partitions; it != end; ++it) + { + size_t end_offset = partitions[*it].next_offset; + It partition_end = begin + end_offset; + std::ptrdiff_t num_elements = end_offset - start_offset; + if (!StdSortIfLessThanThreshold(partition_begin, partition_end, num_elements, extract_key)) + { + UnsignedInplaceSorter::sort(partition_begin, partition_end, num_elements, extract_key, next_sort, sort_data); + } + start_offset = end_offset; + partition_begin = partition_end; + } + } + } + + template + static void ska_byte_sort(It begin, It end, ExtractKey & extract_key, void (*next_sort)(It, It, std::ptrdiff_t, ExtractKey &, void *), void * sort_data) + { + PartitionInfo partitions[256]; + for (It it = begin; it != end; ++it) + { + ++partitions[current_byte(extract_key(*it), sort_data)].count; + } + uint8_t remaining_partitions[256]; + size_t total = 0; + int num_partitions = 0; + for (int i = 0; i < 256; ++i) + { + size_t count = partitions[i].count; + if (count) + { + partitions[i].offset = total; + total += count; + remaining_partitions[num_partitions] = i; + ++num_partitions; + } + partitions[i].next_offset = total; + } + for (uint8_t * last_remaining = remaining_partitions + num_partitions, * end_partition = 
remaining_partitions + 1; last_remaining > end_partition;) + { + last_remaining = custom_std_partition(remaining_partitions, last_remaining, [&](uint8_t partition) + { + size_t & begin_offset = partitions[partition].offset; + size_t & end_offset = partitions[partition].next_offset; + if (begin_offset == end_offset) + return false; + + unroll_loop_four_times(begin + begin_offset, end_offset - begin_offset, [partitions = partitions, begin, &extract_key, sort_data](It it) + { + uint8_t this_partition = current_byte(extract_key(*it), sort_data); + size_t offset = partitions[this_partition].offset++; + std::iter_swap(it, begin + offset); + }); + return begin_offset != end_offset; + }); + } + if (Offset + 1 != NumBytes || next_sort) + { + for (uint8_t * it = remaining_partitions + num_partitions; it != remaining_partitions; --it) + { + uint8_t partition = it[-1]; + size_t start_offset = (partition == 0 ? 0 : partitions[partition - 1].next_offset); + size_t end_offset = partitions[partition].next_offset; + It partition_begin = begin + start_offset; + It partition_end = begin + end_offset; + std::ptrdiff_t num_elements = end_offset - start_offset; + if (!StdSortIfLessThanThreshold(partition_begin, partition_end, num_elements, extract_key)) + { + UnsignedInplaceSorter::sort(partition_begin, partition_end, num_elements, extract_key, next_sort, sort_data); + } + } + } + } +}; + +template +struct UnsignedInplaceSorter +{ + template + inline static void sort(It begin, It end, std::ptrdiff_t num_elements, ExtractKey & extract_key, void (*next_sort)(It, It, std::ptrdiff_t, ExtractKey &, void *), void * next_sort_data) + { + next_sort(begin, end, num_elements, extract_key, next_sort_data); + } +}; + +template +size_t CommonPrefix(It begin, It end, size_t start_index, ExtractKey && extract_key, ElementKey && element_key) +{ + const auto & largest_match_list = extract_key(*begin); + size_t largest_match = largest_match_list.size(); + if (largest_match == start_index) + return start_index; + for (++begin; begin != end; ++begin) + { + const auto & current_list = extract_key(*begin); + size_t current_size = current_list.size(); + if (current_size < largest_match) + { + largest_match = current_size; + if (largest_match == start_index) + return start_index; + } + if (element_key(largest_match_list[start_index]) != element_key(current_list[start_index])) + return start_index; + for (size_t i = start_index + 1; i < largest_match; ++i) + { + if (element_key(largest_match_list[i]) != element_key(current_list[i])) + { + largest_match = i; + break; + } + } + } + return largest_match; +} + +template +struct ListInplaceSorter +{ + using ElementSubKey = ListElementSubKey; + template + static void sort(It begin, It end, ExtractKey & extract_key, ListSortData * sort_data) + { + size_t current_index = sort_data->current_index; + void * next_sort_data = sort_data->next_sort_data; + auto current_key = [&](auto && elem) -> decltype(auto) + { + return CurrentSubKey::sub_key(extract_key(elem), next_sort_data); + }; + auto element_key = [&](auto && elem) -> decltype(auto) + { + return ElementSubKey::base::sub_key(elem, sort_data); + }; + sort_data->current_index = current_index = CommonPrefix(begin, end, current_index, current_key, element_key); + It end_of_shorter_ones = std::partition(begin, end, [&](auto && elem) + { + return current_key(elem).size() <= current_index; + }); + std::ptrdiff_t num_shorter_ones = end_of_shorter_ones - begin; + if (sort_data->next_sort && !StdSortIfLessThanThreshold(begin, end_of_shorter_ones, 
num_shorter_ones, extract_key)) + { + sort_data->next_sort(begin, end_of_shorter_ones, num_shorter_ones, extract_key, next_sort_data); + } + std::ptrdiff_t num_elements = end - end_of_shorter_ones; + if (!StdSortIfLessThanThreshold(end_of_shorter_ones, end, num_elements, extract_key)) + { + void (*sort_next_element)(It, It, std::ptrdiff_t, ExtractKey &, void *) = static_cast(&sort_from_recursion); + InplaceSorter::sort(end_of_shorter_ones, end, num_elements, extract_key, sort_next_element, sort_data); + } + } + + template + static void sort_from_recursion(It begin, It end, std::ptrdiff_t, ExtractKey & extract_key, void * next_sort_data) + { + ListSortData offset = *static_cast *>(next_sort_data); + ++offset.current_index; + --offset.recursion_limit; + if (offset.recursion_limit == 0) + { + StdSortFallback(begin, end, extract_key); + } + else + { + sort(begin, end, extract_key, &offset); + } + } + + + template + static void sort(It begin, It end, std::ptrdiff_t, ExtractKey & extract_key, void (*next_sort)(It, It, std::ptrdiff_t, ExtractKey &, void *), void * next_sort_data) + { + ListSortData offset; + offset.current_index = 0; + offset.recursion_limit = 16; + offset.next_sort = next_sort; + offset.next_sort_data = next_sort_data; + sort(begin, end, extract_key, &offset); + } +}; + +template +struct InplaceSorter +{ + template + static void sort(It begin, It end, std::ptrdiff_t, ExtractKey & extract_key, void (*next_sort)(It, It, std::ptrdiff_t, ExtractKey &, void *), void * sort_data) + { + It middle = std::partition(begin, end, [&](auto && a){ return !CurrentSubKey::sub_key(extract_key(a), sort_data); }); + if (next_sort) + { + next_sort(begin, middle, middle - begin, extract_key, sort_data); + next_sort(middle, end, end - middle, extract_key, sort_data); + } + } +}; + +template +struct InplaceSorter : UnsignedInplaceSorter +{ +}; +template +struct InplaceSorter : UnsignedInplaceSorter +{ +}; +template +struct InplaceSorter : UnsignedInplaceSorter +{ +}; +template +struct InplaceSorter : UnsignedInplaceSorter +{ +}; +template +struct FallbackInplaceSorter; + +template +struct InplaceSorter : FallbackInplaceSorter +{ +}; + +template +struct FallbackInplaceSorter::value>::type> + : ListInplaceSorter +{ +}; + +template +struct SortStarter; +template +struct SortStarter > +{ + template + static void sort(It, It, std::ptrdiff_t, ExtractKey &, void *) + { + } +}; + +template +struct SortStarter +{ + template + static void sort(It begin, It end, std::ptrdiff_t num_elements, ExtractKey & extract_key, void * next_sort_data = nullptr) + { + if (StdSortIfLessThanThreshold(begin, end, num_elements, extract_key)) + return; + + void (*next_sort)(It, It, std::ptrdiff_t, ExtractKey &, void *) = static_cast(&SortStarter::sort); + if (next_sort == static_cast(&SortStarter >::sort)) + next_sort = nullptr; + InplaceSorter::sort(begin, end, num_elements, extract_key, next_sort, next_sort_data); + } +}; + +template +void inplace_radix_sort(It begin, It end, ExtractKey & extract_key) +{ + using SubKey = SubKey; + SortStarter::sort(begin, end, end - begin, extract_key); +} + +struct IdentityFunctor +{ + template + decltype(auto) operator()(T && i) const + { + return std::forward(i); + } +}; +} + +template +static void ska_sort(It begin, It end, ExtractKey && extract_key) +{ + detail::inplace_radix_sort<128, 1024>(begin, end, extract_key); +} + +template +static void ska_sort(It begin, It end) +{ + ska_sort(begin, end, detail::IdentityFunctor()); +} + +template +bool ska_sort_copy(It begin, It end, OutIt 
buffer_begin, ExtractKey && key) +{ + std::ptrdiff_t num_elements = end - begin; + if (num_elements < 128 || detail::radix_sort_pass_count::type> >= 8) + { + ska_sort(begin, end, key); + return false; + } + else + return detail::RadixSorter::type>::sort(begin, end, buffer_begin, key); +} +template +bool ska_sort_copy(It begin, It end, OutIt buffer_begin) +{ + return ska_sort_copy(begin, end, buffer_begin, detail::IdentityFunctor()); +} + +#endif /* __cplusplus */ +COSMOPOLITAN_C_START_ + +void ska_sort_int64(int64_t *, size_t); + +COSMOPOLITAN_C_END_ +#endif /* COSMOPOLITAN_LIBC_MEM_SKA_SORT_H_ */ diff --git a/third_party/libcxx/utils.h b/third_party/libcxx/utils.h new file mode 100644 index 000000000..a3d93c9f7 --- /dev/null +++ b/third_party/libcxx/utils.h @@ -0,0 +1,42 @@ +// -*- c++ -*- +// clang-format off +#ifndef COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_UTILS_H_ +#define COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_UTILS_H_ +#include "third_party/libcxx/iterator" + +namespace learned_sort { +namespace utils { + +// Helper functions for working with unsigned types +template +inline T _get_key(T a) { + return a; +} + +template +void insertion_sort(RandomIt begin, RandomIt end) { + // Determine the data type + typedef typename std::iterator_traits::value_type T; + + // Determine the input size + const size_t input_sz = std::distance(begin, end); + + if (input_sz <= 0) return; + + RandomIt cmp_idx; + T key; + for (auto i = begin + 1; i != end; ++i) { + key = i[0]; + cmp_idx = i - 1; + while (cmp_idx >= begin && cmp_idx[0] > key) { + cmp_idx[1] = cmp_idx[0]; + --cmp_idx; + } + cmp_idx[1] = key; + } +} + +} // namespace utils +} // namespace learned_sort + +#endif /* COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_UTILS_H_ */
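For orientation, here is a minimal usage sketch of the interfaces introduced above. The template entry points (pdqsort, ska_sort) come directly from the headers in this change; the flat C wrappers additionally assume their *.cc translation units are linked in, and the sample data and return-code convention are illustrative only:

#include "third_party/libcxx/algorithm"
#include "third_party/libcxx/pdqsort.h"
#include "third_party/libcxx/ska_sort.h"

int main() {
  int64_t a[] = {9, -3, 7, 7, 0, -128, 42};
  int64_t b[] = {9, -3, 7, 7, 0, -128, 42};
  size_t n = sizeof(a) / sizeof(a[0]);

  // Header-only template interfaces.
  pdqsort(a, a + n);   // std::less comparison; branchless partitioning for arithmetic types
  ska_sort(b, b + n);  // radix entry point; inputs this small fall back to std::sort internally

  // Flat C entry points declared in the same headers.
  pdqsort_branchless_int64(a, n);  // already sorted, so this is nearly a no-op
  ska_sort_int64(b, n);

  // Every strategy must produce the same order.
  return std::equal(a, a + n, b) ? 0 : 1;
}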