Mirror of https://github.com/jart/cosmopolitan.git

Moar sorting
parent dbb8d66f22 · commit 7f2bf3895e
11 changed files with 3627 additions and 9 deletions
@@ -16,6 +16,7 @@
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/macros.internal.h"
#include "libc/mem/alg.h"
#include "libc/mem/gc.internal.h"
#include "libc/mem/mem.h"
@@ -25,8 +26,24 @@
#include "libc/str/str.h"
#include "libc/testlib/ezbench.h"
#include "libc/testlib/testlib.h"
+#include "third_party/libcxx/learned_sort.h"
+#include "third_party/libcxx/pdqsort.h"
+#include "third_party/libcxx/ska_sort.h"
#include "third_party/vqsort/vqsort.h"

+void InsertionSort(long *A, long n) {
+  long i, j, t;
+  for (i = 1; i < n; i++) {
+    t = A[i];
+    j = i - 1;
+    while (j >= 0 && A[j] > t) {
+      A[j + 1] = A[j];
+      j = j - 1;
+    }
+    A[j + 1] = t;
+  }
+}
+
int CompareLong(const void *a, const void *b) {
  const long *x = a;
  const long *y = b;
@@ -110,25 +127,87 @@ BENCH(_longsort, bench) {
  long *p1 = gc(malloc(n * sizeof(long)));
  long *p2 = gc(malloc(n * sizeof(long)));
  rngset(p1, n * sizeof(long), 0, 0);
- EZBENCH2("_longsort", memcpy(p2, p1, n * sizeof(long)), _longsort(p2, n));
+ EZBENCH2("_longsort rand", memcpy(p2, p1, n * sizeof(long)),
+          _longsort(p2, n));
  if (X86_HAVE(AVX2)) {
-   EZBENCH2("vqsort_int64_avx2", memcpy(p2, p1, n * sizeof(long)),
+   EZBENCH2("vq_int64_avx2 rand", memcpy(p2, p1, n * sizeof(long)),
             vqsort_int64_avx2(p2, n));
  }
  if (X86_HAVE(SSE4_2)) {
-   EZBENCH2("vqsort_int64_sse4", memcpy(p2, p1, n * sizeof(long)),
+   EZBENCH2("vq_int64_sse4 rand", memcpy(p2, p1, n * sizeof(long)),
             vqsort_int64_sse4(p2, n));
  }
  if (X86_HAVE(SSSE3)) {
-   EZBENCH2("vqsort_int64_ssse3", memcpy(p2, p1, n * sizeof(long)),
+   EZBENCH2("vq_int64_ssse3 rand", memcpy(p2, p1, n * sizeof(long)),
             vqsort_int64_ssse3(p2, n));
  }
- EZBENCH2("vqsort_int64_sse2", memcpy(p2, p1, n * sizeof(long)),
+ EZBENCH2("vq_int64_sse2 rand", memcpy(p2, p1, n * sizeof(long)),
           vqsort_int64_sse2(p2, n));
- EZBENCH2("radix_sort_int64", memcpy(p2, p1, n * sizeof(long)),
+ EZBENCH2("radix_int64 rand", memcpy(p2, p1, n * sizeof(long)),
           radix_sort_int64(p2, n));
- EZBENCH2("qsort(long)", memcpy(p2, p1, n * sizeof(long)),
+ EZBENCH2("ska_sort_int64 rand", memcpy(p2, p1, n * sizeof(long)),
+          ska_sort_int64(p2, n));
+ EZBENCH2("pdq_int64 rand", memcpy(p2, p1, n * sizeof(long)),
+          pdqsort_int64(p2, n));
+ EZBENCH2("pdq_branchless rand", memcpy(p2, p1, n * sizeof(long)),
+          pdqsort_branchless_int64(p2, n));
+ EZBENCH2("learned_int64 rand", memcpy(p2, p1, n * sizeof(long)),
+          learned_sort_int64(p2, n));
+ EZBENCH2("qsort(long) rand", memcpy(p2, p1, n * sizeof(long)),
           qsort(p2, n, sizeof(long), CompareLong));
+ EZBENCH2("InsertionSort rand", memcpy(p2, p1, n * sizeof(long)),
+          InsertionSort(p2, n));
}
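Each EZBENCH2() call here pairs a setup expression with the expression being timed: the memcpy() restores the same unsorted bytes into p2 before every trial, so each sorter is measured on identical input. A minimal sketch of that restore-then-measure pattern in portable C++ follows; it is not the testlib implementation, and BenchSort/kTrials are hypothetical names.

#include <chrono>
#include <cstdio>
#include <cstring>
#include <vector>

// Times sort(work.data(), work.size()) repeatedly, restoring the unsorted
// input before every trial, and reports the best observed wall time.
template <typename Sort>
long long BenchSort(const char *name, const std::vector<long> &input, Sort sort) {
  enum { kTrials = 100 };  // hypothetical trial count
  std::vector<long> work(input.size());
  long long best = -1;
  for (int i = 0; i < kTrials; ++i) {
    std::memcpy(work.data(), input.data(), input.size() * sizeof(long));  // setup
    auto t0 = std::chrono::steady_clock::now();
    sort(work.data(), work.size());  // measured expression
    auto t1 = std::chrono::steady_clock::now();
    long long ns =
        std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
    if (best < 0 || ns < best) best = ns;
  }
  std::printf("%-24s %lld ns\n", name, best);
  return best;
}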
+
+BENCH(_longsort, benchNearlySorted) {
+  printf("\n");
+  long i, j, k, t;
+  long n = 5000;  // total items
+  long m = 500;   // misplaced items
+  long d = 5;     // maximum drift
+  long *p1 = gc(malloc(n * sizeof(long)));
+  long *p2 = gc(malloc(n * sizeof(long)));
+  for (i = 0; i < n; ++i) {
+    p1[i] = i;
+  }
+  for (i = 0; i < m; ++i) {
+    j = rand() % n;
+    k = rand() % (d * 2) - d + j;
+    k = MIN(MAX(0, k), n - 1);
+    t = p1[j];
+    p1[j] = p1[k];
+    p1[k] = t;
+  }
+  EZBENCH2("_longsort near", memcpy(p2, p1, n * sizeof(long)),
+           _longsort(p2, n));
+  if (X86_HAVE(AVX2)) {
+    EZBENCH2("vq_int64_avx2 near", memcpy(p2, p1, n * sizeof(long)),
+             vqsort_int64_avx2(p2, n));
+  }
+  if (X86_HAVE(SSE4_2)) {
+    EZBENCH2("vq_int64_sse4 near", memcpy(p2, p1, n * sizeof(long)),
+             vqsort_int64_sse4(p2, n));
+  }
+  if (X86_HAVE(SSSE3)) {
+    EZBENCH2("vq_int64_ssse3 near", memcpy(p2, p1, n * sizeof(long)),
+             vqsort_int64_ssse3(p2, n));
+  }
+  EZBENCH2("vq_int64_sse2 near", memcpy(p2, p1, n * sizeof(long)),
+           vqsort_int64_sse2(p2, n));
+  EZBENCH2("radix_int64 near", memcpy(p2, p1, n * sizeof(long)),
+           radix_sort_int64(p2, n));
+  EZBENCH2("ska_sort_int64 near", memcpy(p2, p1, n * sizeof(long)),
+           ska_sort_int64(p2, n));
+  EZBENCH2("pdq_int64 near", memcpy(p2, p1, n * sizeof(long)),
+           pdqsort_int64(p2, n));
+  EZBENCH2("pdq_branchless near", memcpy(p2, p1, n * sizeof(long)),
+           pdqsort_branchless_int64(p2, n));
+  EZBENCH2("learned_int64 near", memcpy(p2, p1, n * sizeof(long)),
+           learned_sort_int64(p2, n));
+  EZBENCH2("qsort(long) near", memcpy(p2, p1, n * sizeof(long)),
+           qsort(p2, n, sizeof(long), CompareLong));
+  EZBENCH2("InsertionSort near", memcpy(p2, p1, n * sizeof(long)),
+           InsertionSort(p2, n));
+}
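The generator above produces a nearly sorted permutation: m of the n items are swapped with a neighbour at most d slots away. A correctness cross-check is not part of this commit, but a minimal sketch against std::sort could look like the following; CheckAgainstStdSort is a hypothetical name, and any of the wrappers added above (e.g. pdqsort_int64 or ska_sort_int64) fits the function-pointer parameter.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Sorts a copy of the input with the wrapper under test and with std::sort,
// then asserts that both produce the same sequence.
void CheckAgainstStdSort(std::vector<int64_t> input,
                         void (*sorter)(int64_t *, size_t)) {
  std::vector<int64_t> expected = input;
  std::sort(expected.begin(), expected.end());
  sorter(input.data(), input.size());
  assert(input == expected);
}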

int CompareInt(const void *a, const void *b) {
@@ -215,6 +294,7 @@ BENCH(_intsort, bench) {
  int *p2 = gc(malloc(n * sizeof(int)));
  rngset(p1, n * sizeof(int), 0, 0);
  EZBENCH2("_intsort", memcpy(p2, p1, n * sizeof(int)), _intsort(p2, n));
+ EZBENCH2("djbsort", memcpy(p2, p1, n * sizeof(int)), djbsort(p2, n));
  if (X86_HAVE(AVX2)) {
    EZBENCH2("vqsort_int32_avx2", memcpy(p2, p1, n * sizeof(int)),
             vqsort_int32_avx2(p2, n));
@@ -229,7 +309,6 @@ BENCH(_intsort, bench) {
  }
  EZBENCH2("vqsort_int32_sse2", memcpy(p2, p1, n * sizeof(int)),
           vqsort_int32_sse2(p2, n));
- EZBENCH2("djbsort", memcpy(p2, p1, n * sizeof(int)), djbsort(p2, n));
  EZBENCH2("radix_sort_int32", memcpy(p2, p1, n * sizeof(int)),
           radix_sort_int32(p2, n));
  EZBENCH2("qsort(int)", memcpy(p2, p1, n * sizeof(int)),
2  third_party/ggml/llama_util.h (vendored)
@@ -1,6 +1,6 @@
// -*- c++ -*-
// Internal header to be included only by llama.cpp.
// Contains wrappers around OS interfaces.

#ifndef LLAMA_UTIL_H
#define LLAMA_UTIL_H
#include "libc/calls/struct/rlimit.h"
25  third_party/libcxx/learned_sort.cc (vendored, new file)
@@ -0,0 +1,25 @@
/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│
│vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2023 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "third_party/libcxx/learned_sort.h"

void learned_sort_int64(int64_t* A, size_t n) {
  std::vector<int64_t> v(A, A + n);
  learned_sort::sort(v.begin(), v.end());
  std::copy(v.begin(), v.end(), A);
}
1100  third_party/libcxx/learned_sort.h (vendored, new file)
(File diff suppressed because it is too large.)
8  third_party/libcxx/libcxx.mk (vendored)
@@ -8,6 +8,11 @@ THIRD_PARTY_LIBCXX = $(THIRD_PARTY_LIBCXX_A_DEPS) $(THIRD_PARTY_LIBCXX_A)
 THIRD_PARTY_LIBCXX_A = o/$(MODE)/third_party/libcxx/libcxx.a

 THIRD_PARTY_LIBCXX_A_HDRS = \
+	third_party/libcxx/rmi.h \
+	third_party/libcxx/utils.h \
+	third_party/libcxx/learned_sort.h \
+	third_party/libcxx/pdqsort.h \
+	third_party/libcxx/ska_sort.h \
 	third_party/libcxx/__bit_reference \
 	third_party/libcxx/__bsd_locale_fallbacks.h \
 	third_party/libcxx/__config \

@@ -138,6 +143,9 @@ THIRD_PARTY_LIBCXX_A_HDRS = \
 	third_party/libcxx/wctype.h

 THIRD_PARTY_LIBCXX_A_SRCS_CC = \
+	third_party/libcxx/learned_sort.cc \
+	third_party/libcxx/pdqsort.cc \
+	third_party/libcxx/ska_sort.cc \
 	third_party/libcxx/algorithm.cc \
 	third_party/libcxx/charconv.cc \
 	third_party/libcxx/chrono.cc \
25  third_party/libcxx/pdqsort.cc (vendored, new file)
@@ -0,0 +1,25 @@
/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│
│vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2023 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "third_party/libcxx/pdqsort.h"

void pdqsort_int64(int64_t* A, size_t n) { pdqsort(A, A + n); }

void pdqsort_branchless_int64(int64_t* A, size_t n) {
  pdqsort_branchless(A, A + n);
}
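These wrappers pin the pdqsort templates to a concrete int64_t instantiation with C linkage (the declarations sit in the COSMOPOLITAN_C_START_ block at the bottom of pdqsort.h), which is what lets the plain-C benchmark above call them. A hedged usage sketch; SortThreeValues is a hypothetical example function.

#include <stddef.h>
#include <stdint.h>

void pdqsort_int64(int64_t *, size_t);  /* prototype as declared in pdqsort.h */

static void SortThreeValues(void) {
  int64_t a[] = {3, 1, 2};
  pdqsort_int64(a, 3);  /* a becomes {1, 2, 3} */
}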
552  third_party/libcxx/pdqsort.h (vendored, new file)
@ -0,0 +1,552 @@
|
|||
/* -*- c++ -*- */
|
||||
/* clang-format off */
|
||||
/*
|
||||
pdqsort.h - Pattern-defeating quicksort.
|
||||
|
||||
Copyright (c) 2015 Orson Peters
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty. In no event will the
|
||||
authors be held liable for any damages arising from the use of this software.
|
||||
|
||||
Permission is granted to anyone to use this software for any purpose, including commercial
|
||||
applications, and to alter it and redistribute it freely, subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the
|
||||
original software. If you use this software in a product, an acknowledgment in the product
|
||||
documentation would be appreciated but is not required.
|
||||
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as
|
||||
being the original software.
|
||||
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef PDQSORT_H
|
||||
#define PDQSORT_H
|
||||
#ifdef __cplusplus
|
||||
#include "third_party/libcxx/algorithm"
|
||||
#include "third_party/libcxx/cstddef"
|
||||
#include "third_party/libcxx/functional"
|
||||
#include "third_party/libcxx/utility"
|
||||
#include "third_party/libcxx/iterator"
|
||||
|
||||
#if __cplusplus >= 201103L
|
||||
#include "third_party/libcxx/cstdint"
|
||||
#include "third_party/libcxx/type_traits"
|
||||
#define PDQSORT_PREFER_MOVE(x) std::move(x)
|
||||
#else
|
||||
#define PDQSORT_PREFER_MOVE(x) (x)
|
||||
#endif
|
||||
|
||||
|
||||
namespace pdqsort_detail {
|
||||
enum {
|
||||
// Partitions below this size are sorted using insertion sort.
|
||||
insertion_sort_threshold = 24,
|
||||
|
||||
// Partitions above this size use Tukey's ninther to select the pivot.
|
||||
ninther_threshold = 128,
|
||||
|
||||
// When we detect an already sorted partition, attempt an insertion sort that allows this
|
||||
// amount of element moves before giving up.
|
||||
partial_insertion_sort_limit = 8,
|
||||
|
||||
// Must be multiple of 8 due to loop unrolling, and < 256 to fit in unsigned char.
|
||||
block_size = 64,
|
||||
|
||||
// Cacheline size, assumes power of two.
|
||||
cacheline_size = 64
|
||||
|
||||
};
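// Illustrative sketch (not used by pdqsort itself): the two pivot rules that
// insertion_sort_threshold and ninther_threshold select between are plain
// median-of-3 and Tukey's ninther, i.e. the median of three medians of three.
// The helper names median3_demo/ninther_demo are hypothetical; pdqsort_loop
// below implements the same idea in place with sort3().
template<class T>
inline const T& median3_demo(const T& a, const T& b, const T& c) {
    if (a < b) return b < c ? b : (a < c ? c : a);
    return a < c ? a : (b < c ? c : b);
}

template<class T>
inline const T& ninther_demo(const T* p, std::size_t n) {
    // Assumes n >= 9; mirrors the index pattern used by pdqsort_loop.
    return median3_demo(median3_demo(p[0], p[n / 2],     p[n - 1]),
                        median3_demo(p[1], p[n / 2 - 1], p[n - 2]),
                        median3_demo(p[2], p[n / 2 + 1], p[n - 3]));
}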
|
||||
|
||||
#if __cplusplus >= 201103L
|
||||
template<class T> struct is_default_compare : std::false_type { };
|
||||
template<class T> struct is_default_compare<std::less<T> > : std::true_type { };
|
||||
template<class T> struct is_default_compare<std::greater<T> > : std::true_type { };
|
||||
#endif
|
||||
|
||||
// Returns floor(log2(n)), assumes n > 0.
|
||||
template<class T>
|
||||
inline int log2(T n) {
|
||||
int log = 0;
|
||||
while (n >>= 1) ++log;
|
||||
return log;
|
||||
}
|
||||
|
||||
// Sorts [begin, end) using insertion sort with the given comparison function.
|
||||
template<class Iter, class Compare>
|
||||
inline void insertion_sort(Iter begin, Iter end, Compare comp) {
|
||||
typedef typename std::iterator_traits<Iter>::value_type T;
|
||||
if (begin == end) return;
|
||||
|
||||
for (Iter cur = begin + 1; cur != end; ++cur) {
|
||||
Iter sift = cur;
|
||||
Iter sift_1 = cur - 1;
|
||||
|
||||
// Compare first so we can avoid 2 moves for an element already positioned correctly.
|
||||
if (comp(*sift, *sift_1)) {
|
||||
T tmp = PDQSORT_PREFER_MOVE(*sift);
|
||||
|
||||
do { *sift-- = PDQSORT_PREFER_MOVE(*sift_1); }
|
||||
while (sift != begin && comp(tmp, *--sift_1));
|
||||
|
||||
*sift = PDQSORT_PREFER_MOVE(tmp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sorts [begin, end) using insertion sort with the given comparison function. Assumes
|
||||
// *(begin - 1) is an element smaller than or equal to any element in [begin, end).
|
||||
template<class Iter, class Compare>
|
||||
inline void unguarded_insertion_sort(Iter begin, Iter end, Compare comp) {
|
||||
typedef typename std::iterator_traits<Iter>::value_type T;
|
||||
if (begin == end) return;
|
||||
|
||||
for (Iter cur = begin + 1; cur != end; ++cur) {
|
||||
Iter sift = cur;
|
||||
Iter sift_1 = cur - 1;
|
||||
|
||||
// Compare first so we can avoid 2 moves for an element already positioned correctly.
|
||||
if (comp(*sift, *sift_1)) {
|
||||
T tmp = PDQSORT_PREFER_MOVE(*sift);
|
||||
|
||||
do { *sift-- = PDQSORT_PREFER_MOVE(*sift_1); }
|
||||
while (comp(tmp, *--sift_1));
|
||||
|
||||
*sift = PDQSORT_PREFER_MOVE(tmp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Attempts to use insertion sort on [begin, end). Will return false if more than
|
||||
// partial_insertion_sort_limit elements were moved, and abort sorting. Otherwise it will
|
||||
// successfully sort and return true.
|
||||
template<class Iter, class Compare>
|
||||
inline bool partial_insertion_sort(Iter begin, Iter end, Compare comp) {
|
||||
typedef typename std::iterator_traits<Iter>::value_type T;
|
||||
if (begin == end) return true;
|
||||
|
||||
std::size_t limit = 0;
|
||||
for (Iter cur = begin + 1; cur != end; ++cur) {
|
||||
Iter sift = cur;
|
||||
Iter sift_1 = cur - 1;
|
||||
|
||||
// Compare first so we can avoid 2 moves for an element already positioned correctly.
|
||||
if (comp(*sift, *sift_1)) {
|
||||
T tmp = PDQSORT_PREFER_MOVE(*sift);
|
||||
|
||||
do { *sift-- = PDQSORT_PREFER_MOVE(*sift_1); }
|
||||
while (sift != begin && comp(tmp, *--sift_1));
|
||||
|
||||
*sift = PDQSORT_PREFER_MOVE(tmp);
|
||||
limit += cur - sift;
|
||||
}
|
||||
|
||||
if (limit > partial_insertion_sort_limit) return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
template<class Iter, class Compare>
|
||||
inline void sort2(Iter a, Iter b, Compare comp) {
|
||||
if (comp(*b, *a)) std::iter_swap(a, b);
|
||||
}
|
||||
|
||||
// Sorts the elements *a, *b and *c using comparison function comp.
|
||||
template<class Iter, class Compare>
|
||||
inline void sort3(Iter a, Iter b, Iter c, Compare comp) {
|
||||
sort2(a, b, comp);
|
||||
sort2(b, c, comp);
|
||||
sort2(a, b, comp);
|
||||
}
|
||||
|
||||
template<class T>
|
||||
inline T* align_cacheline(T* p) {
|
||||
#if defined(UINTPTR_MAX) && __cplusplus >= 201103L
|
||||
std::uintptr_t ip = reinterpret_cast<std::uintptr_t>(p);
|
||||
#else
|
||||
std::size_t ip = reinterpret_cast<std::size_t>(p);
|
||||
#endif
|
||||
ip = (ip + cacheline_size - 1) & -cacheline_size;
|
||||
return reinterpret_cast<T*>(ip);
|
||||
}
|
||||
|
||||
template<class Iter>
|
||||
inline void swap_offsets(Iter first, Iter last,
|
||||
unsigned char* offsets_l, unsigned char* offsets_r,
|
||||
int num, bool use_swaps) {
|
||||
typedef typename std::iterator_traits<Iter>::value_type T;
|
||||
if (use_swaps) {
|
||||
// This case is needed for the descending distribution, where we need
|
||||
// to have proper swapping for pdqsort to remain O(n).
|
||||
for (int i = 0; i < num; ++i) {
|
||||
std::iter_swap(first + offsets_l[i], last - offsets_r[i]);
|
||||
}
|
||||
} else if (num > 0) {
|
||||
Iter l = first + offsets_l[0]; Iter r = last - offsets_r[0];
|
||||
T tmp(PDQSORT_PREFER_MOVE(*l)); *l = PDQSORT_PREFER_MOVE(*r);
|
||||
for (int i = 1; i < num; ++i) {
|
||||
l = first + offsets_l[i]; *r = PDQSORT_PREFER_MOVE(*l);
|
||||
r = last - offsets_r[i]; *l = PDQSORT_PREFER_MOVE(*r);
|
||||
}
|
||||
*r = PDQSORT_PREFER_MOVE(tmp);
|
||||
}
|
||||
}
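// Illustrative sketch (not used by pdqsort itself): how the offset buffers
// consumed by swap_offsets() above get populated without branches. Every
// index in a block is written speculatively and the cursor advances only
// when the element belongs on the other side, so the comparison result is
// consumed as an integer rather than as a branch. fill_offsets_demo is a
// hypothetical name; partition_right_branchless below does this inline with
// an unrolled loop.
template<class Iter, class T, class Compare>
inline int fill_offsets_demo(Iter block, unsigned char* offsets,
                             unsigned char block_len, const T& pivot,
                             Compare comp) {
    int num = 0;
    for (unsigned char i = 0; i < block_len; ++i) {
        offsets[num] = i;               // record the index speculatively
        num += !comp(block[i], pivot);  // keep it only if *it >= pivot
    }
    return num;                         // elements that must be swapped across
}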
|
||||
|
||||
// Partitions [begin, end) around pivot *begin using comparison function comp. Elements equal
|
||||
// to the pivot are put in the right-hand partition. Returns the position of the pivot after
|
||||
// partitioning and whether the passed sequence already was correctly partitioned. Assumes the
|
||||
// pivot is a median of at least 3 elements and that [begin, end) is at least
|
||||
// insertion_sort_threshold long. Uses branchless partitioning.
|
||||
template<class Iter, class Compare>
|
||||
inline std::pair<Iter, bool> partition_right_branchless(Iter begin, Iter end, Compare comp) {
|
||||
typedef typename std::iterator_traits<Iter>::value_type T;
|
||||
|
||||
// Move pivot into local for speed.
|
||||
T pivot(PDQSORT_PREFER_MOVE(*begin));
|
||||
Iter first = begin;
|
||||
Iter last = end;
|
||||
|
||||
// Find the first element greater than or equal than the pivot (the median of 3 guarantees
|
||||
// this exists).
|
||||
while (comp(*++first, pivot));
|
||||
|
||||
// Find the first element strictly smaller than the pivot. We have to guard this search if
|
||||
// there was no element before *first.
|
||||
if (first - 1 == begin) while (first < last && !comp(*--last, pivot));
|
||||
else while ( !comp(*--last, pivot));
|
||||
|
||||
// If the first pair of elements that should be swapped to partition are the same element,
|
||||
// the passed in sequence already was correctly partitioned.
|
||||
bool already_partitioned = first >= last;
|
||||
if (!already_partitioned) {
|
||||
std::iter_swap(first, last);
|
||||
++first;
|
||||
}
|
||||
|
||||
// The following branchless partitioning is derived from "BlockQuicksort: How Branch
|
||||
// Mispredictions don’t affect Quicksort" by Stefan Edelkamp and Armin Weiss.
|
||||
unsigned char offsets_l_storage[block_size + cacheline_size];
|
||||
unsigned char offsets_r_storage[block_size + cacheline_size];
|
||||
unsigned char* offsets_l = align_cacheline(offsets_l_storage);
|
||||
unsigned char* offsets_r = align_cacheline(offsets_r_storage);
|
||||
int num_l, num_r, start_l, start_r;
|
||||
num_l = num_r = start_l = start_r = 0;
|
||||
|
||||
while (last - first > 2 * block_size) {
|
||||
// Fill up offset blocks with elements that are on the wrong side.
|
||||
if (num_l == 0) {
|
||||
start_l = 0;
|
||||
Iter it = first;
|
||||
for (unsigned char i = 0; i < block_size;) {
|
||||
offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
|
||||
offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
|
||||
offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
|
||||
offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
|
||||
offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
|
||||
offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
|
||||
offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
|
||||
offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
|
||||
}
|
||||
}
|
||||
if (num_r == 0) {
|
||||
start_r = 0;
|
||||
Iter it = last;
|
||||
for (unsigned char i = 0; i < block_size;) {
|
||||
offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
|
||||
offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
|
||||
offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
|
||||
offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
|
||||
offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
|
||||
offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
|
||||
offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
|
||||
offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
|
||||
}
|
||||
}
|
||||
|
||||
// Swap elements and update block sizes and first/last boundaries.
|
||||
int num = std::min(num_l, num_r);
|
||||
swap_offsets(first, last, offsets_l + start_l, offsets_r + start_r,
|
||||
num, num_l == num_r);
|
||||
num_l -= num; num_r -= num;
|
||||
start_l += num; start_r += num;
|
||||
if (num_l == 0) first += block_size;
|
||||
if (num_r == 0) last -= block_size;
|
||||
}
|
||||
|
||||
int l_size = 0, r_size = 0;
|
||||
int unknown_left = (int)(last - first) - ((num_r || num_l) ? block_size : 0);
|
||||
if (num_r) {
|
||||
// Handle leftover block by assigning the unknown elements to the other block.
|
||||
l_size = unknown_left;
|
||||
r_size = block_size;
|
||||
} else if (num_l) {
|
||||
l_size = block_size;
|
||||
r_size = unknown_left;
|
||||
} else {
|
||||
// No leftover block, split the unknown elements in two blocks.
|
||||
l_size = unknown_left/2;
|
||||
r_size = unknown_left - l_size;
|
||||
}
|
||||
|
||||
// Fill offset buffers if needed.
|
||||
if (unknown_left && !num_l) {
|
||||
start_l = 0;
|
||||
Iter it = first;
|
||||
for (unsigned char i = 0; i < l_size;) {
|
||||
offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
|
||||
}
|
||||
}
|
||||
if (unknown_left && !num_r) {
|
||||
start_r = 0;
|
||||
Iter it = last;
|
||||
for (unsigned char i = 0; i < r_size;) {
|
||||
offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
|
||||
}
|
||||
}
|
||||
|
||||
int num = std::min(num_l, num_r);
|
||||
swap_offsets(first, last, offsets_l + start_l, offsets_r + start_r, num, num_l == num_r);
|
||||
num_l -= num; num_r -= num;
|
||||
start_l += num; start_r += num;
|
||||
if (num_l == 0) first += l_size;
|
||||
if (num_r == 0) last -= r_size;
|
||||
|
||||
// We have now fully identified [first, last)'s proper position. Swap the last elements.
|
||||
if (num_l) {
|
||||
offsets_l += start_l;
|
||||
while (num_l--) std::iter_swap(first + offsets_l[num_l], --last);
|
||||
first = last;
|
||||
}
|
||||
if (num_r) {
|
||||
offsets_r += start_r;
|
||||
while (num_r--) std::iter_swap(last - offsets_r[num_r], first), ++first;
|
||||
last = first;
|
||||
}
|
||||
|
||||
// Put the pivot in the right place.
|
||||
Iter pivot_pos = first - 1;
|
||||
*begin = PDQSORT_PREFER_MOVE(*pivot_pos);
|
||||
*pivot_pos = PDQSORT_PREFER_MOVE(pivot);
|
||||
|
||||
return std::make_pair(pivot_pos, already_partitioned);
|
||||
}
|
||||
|
||||
// Partitions [begin, end) around pivot *begin using comparison function comp. Elements equal
|
||||
// to the pivot are put in the right-hand partition. Returns the position of the pivot after
|
||||
// partitioning and whether the passed sequence already was correctly partitioned. Assumes the
|
||||
// pivot is a median of at least 3 elements and that [begin, end) is at least
|
||||
// insertion_sort_threshold long.
|
||||
template<class Iter, class Compare>
|
||||
inline std::pair<Iter, bool> partition_right(Iter begin, Iter end, Compare comp) {
|
||||
typedef typename std::iterator_traits<Iter>::value_type T;
|
||||
|
||||
// Move pivot into local for speed.
|
||||
T pivot(PDQSORT_PREFER_MOVE(*begin));
|
||||
|
||||
Iter first = begin;
|
||||
Iter last = end;
|
||||
|
||||
// Find the first element greater than or equal than the pivot (the median of 3 guarantees
|
||||
// this exists).
|
||||
while (comp(*++first, pivot));
|
||||
|
||||
// Find the first element strictly smaller than the pivot. We have to guard this search if
|
||||
// there was no element before *first.
|
||||
if (first - 1 == begin) while (first < last && !comp(*--last, pivot));
|
||||
else while ( !comp(*--last, pivot));
|
||||
|
||||
// If the first pair of elements that should be swapped to partition are the same element,
|
||||
// the passed in sequence already was correctly partitioned.
|
||||
bool already_partitioned = first >= last;
|
||||
|
||||
// Keep swapping pairs of elements that are on the wrong side of the pivot. Previously
|
||||
// swapped pairs guard the searches, which is why the first iteration is special-cased
|
||||
// above.
|
||||
while (first < last) {
|
||||
std::iter_swap(first, last);
|
||||
while (comp(*++first, pivot));
|
||||
while (!comp(*--last, pivot));
|
||||
}
|
||||
|
||||
// Put the pivot in the right place.
|
||||
Iter pivot_pos = first - 1;
|
||||
*begin = PDQSORT_PREFER_MOVE(*pivot_pos);
|
||||
*pivot_pos = PDQSORT_PREFER_MOVE(pivot);
|
||||
|
||||
return std::make_pair(pivot_pos, already_partitioned);
|
||||
}
|
||||
|
||||
// Similar function to the one above, except elements equal to the pivot are put to the left of
|
||||
// the pivot and it doesn't check or return if the passed sequence already was partitioned.
|
||||
// Since this is rarely used (the many equal case), and in that case pdqsort already has O(n)
|
||||
// performance, no block quicksort is applied here for simplicity.
|
||||
template<class Iter, class Compare>
|
||||
inline Iter partition_left(Iter begin, Iter end, Compare comp) {
|
||||
typedef typename std::iterator_traits<Iter>::value_type T;
|
||||
|
||||
T pivot(PDQSORT_PREFER_MOVE(*begin));
|
||||
Iter first = begin;
|
||||
Iter last = end;
|
||||
|
||||
while (comp(pivot, *--last));
|
||||
|
||||
if (last + 1 == end) while (first < last && !comp(pivot, *++first));
|
||||
else while ( !comp(pivot, *++first));
|
||||
|
||||
while (first < last) {
|
||||
std::iter_swap(first, last);
|
||||
while (comp(pivot, *--last));
|
||||
while (!comp(pivot, *++first));
|
||||
}
|
||||
|
||||
Iter pivot_pos = last;
|
||||
*begin = PDQSORT_PREFER_MOVE(*pivot_pos);
|
||||
*pivot_pos = PDQSORT_PREFER_MOVE(pivot);
|
||||
|
||||
return pivot_pos;
|
||||
}
|
||||
|
||||
|
||||
template<class Iter, class Compare, bool Branchless>
|
||||
inline void pdqsort_loop(Iter begin, Iter end, Compare comp, int bad_allowed, bool leftmost = true) {
|
||||
typedef typename std::iterator_traits<Iter>::difference_type diff_t;
|
||||
|
||||
// Use a while loop for tail recursion elimination.
|
||||
while (true) {
|
||||
diff_t size = end - begin;
|
||||
|
||||
// Insertion sort is faster for small arrays.
|
||||
if (size < insertion_sort_threshold) {
|
||||
if (leftmost) insertion_sort(begin, end, comp);
|
||||
else unguarded_insertion_sort(begin, end, comp);
|
||||
return;
|
||||
}
|
||||
|
||||
// Choose pivot as median of 3 or pseudomedian of 9.
|
||||
diff_t s2 = size / 2;
|
||||
if (size > ninther_threshold) {
|
||||
sort3(begin, begin + s2, end - 1, comp);
|
||||
sort3(begin + 1, begin + (s2 - 1), end - 2, comp);
|
||||
sort3(begin + 2, begin + (s2 + 1), end - 3, comp);
|
||||
sort3(begin + (s2 - 1), begin + s2, begin + (s2 + 1), comp);
|
||||
std::iter_swap(begin, begin + s2);
|
||||
} else sort3(begin + s2, begin, end - 1, comp);
|
||||
|
||||
// If *(begin - 1) is the end of the right partition of a previous partition operation
|
||||
// there is no element in [begin, end) that is smaller than *(begin - 1). Then if our
|
||||
// pivot compares equal to *(begin - 1) we change strategy, putting equal elements in
|
||||
// the left partition, greater elements in the right partition. We do not have to
|
||||
// recurse on the left partition, since it's sorted (all equal).
|
||||
if (!leftmost && !comp(*(begin - 1), *begin)) {
|
||||
begin = partition_left(begin, end, comp) + 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Partition and get results.
|
||||
std::pair<Iter, bool> part_result =
|
||||
Branchless ? partition_right_branchless(begin, end, comp)
|
||||
: partition_right(begin, end, comp);
|
||||
Iter pivot_pos = part_result.first;
|
||||
bool already_partitioned = part_result.second;
|
||||
|
||||
// Check for a highly unbalanced partition.
|
||||
diff_t l_size = pivot_pos - begin;
|
||||
diff_t r_size = end - (pivot_pos + 1);
|
||||
bool highly_unbalanced = l_size < size / 8 || r_size < size / 8;
|
||||
|
||||
// If we got a highly unbalanced partition we shuffle elements to break many patterns.
|
||||
if (highly_unbalanced) {
|
||||
// If we had too many bad partitions, switch to heapsort to guarantee O(n log n).
|
||||
if (--bad_allowed == 0) {
|
||||
std::make_heap(begin, end, comp);
|
||||
std::sort_heap(begin, end, comp);
|
||||
return;
|
||||
}
|
||||
|
||||
if (l_size >= insertion_sort_threshold) {
|
||||
std::iter_swap(begin, begin + l_size / 4);
|
||||
std::iter_swap(pivot_pos - 1, pivot_pos - l_size / 4);
|
||||
|
||||
if (l_size > ninther_threshold) {
|
||||
std::iter_swap(begin + 1, begin + (l_size / 4 + 1));
|
||||
std::iter_swap(begin + 2, begin + (l_size / 4 + 2));
|
||||
std::iter_swap(pivot_pos - 2, pivot_pos - (l_size / 4 + 1));
|
||||
std::iter_swap(pivot_pos - 3, pivot_pos - (l_size / 4 + 2));
|
||||
}
|
||||
}
|
||||
|
||||
if (r_size >= insertion_sort_threshold) {
|
||||
std::iter_swap(pivot_pos + 1, pivot_pos + (1 + r_size / 4));
|
||||
std::iter_swap(end - 1, end - r_size / 4);
|
||||
|
||||
if (r_size > ninther_threshold) {
|
||||
std::iter_swap(pivot_pos + 2, pivot_pos + (2 + r_size / 4));
|
||||
std::iter_swap(pivot_pos + 3, pivot_pos + (3 + r_size / 4));
|
||||
std::iter_swap(end - 2, end - (1 + r_size / 4));
|
||||
std::iter_swap(end - 3, end - (2 + r_size / 4));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// If we were decently balanced and we tried to sort an already partitioned
|
||||
// sequence try to use insertion sort.
|
||||
if (already_partitioned && partial_insertion_sort(begin, pivot_pos, comp)
|
||||
&& partial_insertion_sort(pivot_pos + 1, end, comp)) return;
|
||||
}
|
||||
|
||||
// Sort the left partition first using recursion and do tail recursion elimination for
|
||||
// the right-hand partition.
|
||||
pdqsort_loop<Iter, Compare, Branchless>(begin, pivot_pos, comp, bad_allowed, leftmost);
|
||||
begin = pivot_pos + 1;
|
||||
leftmost = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<class Iter, class Compare>
|
||||
inline void pdqsort(Iter begin, Iter end, Compare comp) {
|
||||
if (begin == end) return;
|
||||
|
||||
#if __cplusplus >= 201103L
|
||||
pdqsort_detail::pdqsort_loop<Iter, Compare,
|
||||
pdqsort_detail::is_default_compare<typename std::decay<Compare>::type>::value &&
|
||||
std::is_arithmetic<typename std::iterator_traits<Iter>::value_type>::value>(
|
||||
begin, end, comp, pdqsort_detail::log2(end - begin));
|
||||
#else
|
||||
pdqsort_detail::pdqsort_loop<Iter, Compare, false>(
|
||||
begin, end, comp, pdqsort_detail::log2(end - begin));
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Iter>
|
||||
inline void pdqsort(Iter begin, Iter end) {
|
||||
typedef typename std::iterator_traits<Iter>::value_type T;
|
||||
pdqsort(begin, end, std::less<T>());
|
||||
}
|
||||
|
||||
template<class Iter, class Compare>
|
||||
inline void pdqsort_branchless(Iter begin, Iter end, Compare comp) {
|
||||
if (begin == end) return;
|
||||
pdqsort_detail::pdqsort_loop<Iter, Compare, true>(
|
||||
begin, end, comp, pdqsort_detail::log2(end - begin));
|
||||
}
|
||||
|
||||
template<class Iter>
|
||||
inline void pdqsort_branchless(Iter begin, Iter end) {
|
||||
typedef typename std::iterator_traits<Iter>::value_type T;
|
||||
pdqsort_branchless(begin, end, std::less<T>());
|
||||
}
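// Usage sketch, assuming a std::vector<int64_t> v (not part of the original
// header):
//
//   pdqsort(v.begin(), v.end());                            // default std::less
//   pdqsort(v.begin(), v.end(), std::greater<int64_t>());   // custom comparator
//   pdqsort_branchless(v.begin(), v.end());                 // force branchless partition
//
// With the default comparator on an arithmetic value type, the C++11 path
// above already routes plain pdqsort() to the branchless partitioner via
// is_default_compare and std::is_arithmetic.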
|
||||
|
||||
|
||||
#undef PDQSORT_PREFER_MOVE
|
||||
#endif /* __cplusplus */
|
||||
COSMOPOLITAN_C_START_
|
||||
|
||||
void pdqsort_int64(int64_t *, size_t);
|
||||
void pdqsort_branchless_int64(int64_t *, size_t);
|
||||
|
||||
COSMOPOLITAN_C_END_
|
||||
#endif /* PDQSORT_H */
|
308  third_party/libcxx/rmi.h (vendored, new file)
@ -0,0 +1,308 @@
|
|||
// -*- c++ -*-
|
||||
// clang-format off
|
||||
#ifndef COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_RMI_H_
|
||||
#define COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_RMI_H_
|
||||
#include "third_party/libcxx/iostream"
|
||||
#include "third_party/libcxx/vector"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace learned_sort {
|
||||
|
||||
// Packs the key and its respective scaled CDF value
|
||||
template <typename T>
|
||||
struct training_point {
|
||||
T x;
|
||||
double y;
|
||||
};
|
||||
|
||||
// Represents linear models
|
||||
struct linear_model {
|
||||
double slope = 0;
|
||||
double intercept = 0;
|
||||
};
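// Illustrative sketch (not part of the original header): a linear_model is a
// one-dimensional CDF approximation, so the predicted rank of a key is just
// slope * key + intercept, clamped to the valid bucket range. The helper name
// predict_bucket_demo is hypothetical; TwoLayerRMI::train() below inlines the
// same arithmetic when routing training points to leaf models.
template <typename T>
inline long predict_bucket_demo(const linear_model &m, T key, long num_buckets) {
  long rank = m.slope * key + m.intercept;
  if (rank < 0) rank = 0;
  if (rank > num_buckets - 1) rank = num_buckets - 1;
  return rank;
}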
|
||||
|
||||
// An implementation of a 2-layer RMI model
|
||||
template <class T>
|
||||
class TwoLayerRMI {
|
||||
public:
|
||||
// CDF model hyperparameters
|
||||
struct Params {
|
||||
// Member fields
|
||||
long fanout;
|
||||
float sampling_rate;
|
||||
long threshold;
|
||||
long num_leaf_models;
|
||||
|
||||
// Default hyperparameters
|
||||
static constexpr long DEFAULT_FANOUT = 1e3;
|
||||
static constexpr float DEFAULT_SAMPLING_RATE = .01;
|
||||
static constexpr long DEFAULT_THRESHOLD = 100;
|
||||
static constexpr long DEFAULT_NUM_LEAF_MODELS = 1000;
|
||||
static constexpr long MIN_SORTING_SIZE = 1e4;
|
||||
|
||||
// Default constructor
|
||||
Params() {
|
||||
this->fanout = DEFAULT_FANOUT;
|
||||
this->sampling_rate = DEFAULT_SAMPLING_RATE;
|
||||
this->threshold = DEFAULT_THRESHOLD;
|
||||
this->num_leaf_models = DEFAULT_NUM_LEAF_MODELS;
|
||||
}
|
||||
|
||||
// Constructor with custom hyperparameter values
|
||||
Params(float sampling_rate, float overallocation, long fanout,
|
||||
long batch_sz, long threshold) {
|
||||
this->fanout = fanout;
|
||||
this->sampling_rate = sampling_rate;
|
||||
this->threshold = threshold;
|
||||
this->num_leaf_models = DEFAULT_NUM_LEAF_MODELS;
|
||||
}
|
||||
};
|
||||
|
||||
// Member variables of the CDF model
|
||||
bool trained;
|
||||
linear_model root_model;
|
||||
vector<linear_model> leaf_models;
|
||||
vector<T> training_sample;
|
||||
Params hp;
|
||||
bool enable_dups_detection;
|
||||
|
||||
// CDF model constructor
|
||||
TwoLayerRMI(Params p) {
|
||||
this->trained = false;
|
||||
this->hp = p;
|
||||
this->leaf_models.resize(p.num_leaf_models);
|
||||
this->enable_dups_detection = true;
|
||||
}
|
||||
|
||||
// Pretty-printing function
|
||||
void print() {
|
||||
printf("[0][0]: slope=%0.5f; intercept=%0.5f;\n", root_model.slope,
|
||||
root_model.intercept);
|
||||
for (int model_idx = 0; model_idx < hp.num_leaf_models; ++model_idx) {
|
||||
printf("[%i][1]: slope=%0.5f; intercept=%0.5f;\n", model_idx,
|
||||
leaf_models[model_idx].slope, leaf_models[model_idx].intercept);
|
||||
}
|
||||
cout << "-----------------------------" << endl;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Train a CDF function with an RMI architecture, using linear spline
|
||||
* interpolation.
|
||||
*
|
||||
* @param begin Random-access iterators to the initial position of the
|
||||
* sequence to be used for sorting. The range used is [begin,end), which
|
||||
* contains all the elements between first and last, including the element
|
||||
* pointed by first but not the element pointed by last.
|
||||
* @param end Random-access iterators to the last position of the sequence to
|
||||
* be used for sorting. The range used is [begin,end), which contains all the
|
||||
* elements between first and last, including the element pointed by first but
|
||||
* not the element pointed by last.
|
||||
* @return true if the model was trained successfully, false otherwise.
|
||||
*/
|
||||
bool train(typename vector<T>::iterator begin,
|
||||
typename vector<T>::iterator end) {
|
||||
// Determine input size
|
||||
const long INPUT_SZ = std::distance(begin, end);
|
||||
|
||||
// Validate parameters
|
||||
if (this->hp.fanout >= INPUT_SZ) {
|
||||
this->hp.fanout = TwoLayerRMI<T>::Params::DEFAULT_FANOUT;
|
||||
cerr << "\33[93;1mWARNING\33[0m: Invalid fanout. Using default ("
|
||||
<< TwoLayerRMI<T>::Params::DEFAULT_FANOUT << ")." << endl;
|
||||
}
|
||||
|
||||
if (this->hp.sampling_rate <= 0 or this->hp.sampling_rate > 1) {
|
||||
this->hp.sampling_rate = TwoLayerRMI<T>::Params::DEFAULT_SAMPLING_RATE;
|
||||
cerr << "\33[93;1mWARNING\33[0m: Invalid sampling rate. Using default ("
|
||||
<< TwoLayerRMI<T>::Params::DEFAULT_SAMPLING_RATE << ")." << endl;
|
||||
}
|
||||
|
||||
if (this->hp.threshold <= 0 or this->hp.threshold >= INPUT_SZ or
|
||||
this->hp.threshold >= INPUT_SZ / this->hp.fanout) {
|
||||
this->hp.threshold = TwoLayerRMI<T>::Params::DEFAULT_THRESHOLD;
|
||||
cerr << "\33[93;1mWARNING\33[0m: Invalid threshold. Using default ("
|
||||
<< TwoLayerRMI<T>::Params::DEFAULT_THRESHOLD << ")." << endl;
|
||||
}
|
||||
|
||||
// Initialize the CDF model
|
||||
static const long NUM_LAYERS = 2;
|
||||
vector<vector<vector<training_point<T> > > > training_data(NUM_LAYERS);
|
||||
for (long layer_idx = 0; layer_idx < NUM_LAYERS; ++layer_idx) {
|
||||
training_data[layer_idx].resize(hp.num_leaf_models);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------//
|
||||
// SAMPLE //
|
||||
//----------------------------------------------------------//
|
||||
|
||||
// Determine sample size
|
||||
const long SAMPLE_SZ = std::min<long>(
|
||||
INPUT_SZ, std::max<long>(this->hp.sampling_rate * INPUT_SZ,
|
||||
TwoLayerRMI<T>::Params::MIN_SORTING_SIZE));
|
||||
|
||||
// Create a sample array
|
||||
this->training_sample.reserve(SAMPLE_SZ);
|
||||
|
||||
// Start sampling
|
||||
long offset = static_cast<long>(1. * INPUT_SZ / SAMPLE_SZ);
|
||||
for (auto i = begin; i < end; i += offset) {
|
||||
// NOTE: We don't directly assign SAMPLE_SZ to this->training_sample_sz
|
||||
// to avoid issues with divisibility
|
||||
this->training_sample.push_back(*i);
|
||||
}
|
||||
|
||||
// Sort the sample using the provided comparison function
|
||||
std::sort(this->training_sample.begin(), this->training_sample.end());
|
||||
|
||||
// Count the number of unique keys
|
||||
auto sample_cpy = this->training_sample;
|
||||
long num_unique_elms = std::distance(
|
||||
sample_cpy.begin(), std::unique(sample_cpy.begin(), sample_cpy.end()));
|
||||
|
||||
// Stop early if the array has very few unique values. We need at least 2
|
||||
// unique training examples per leaf model.
|
||||
if (num_unique_elms < 2 * this->hp.num_leaf_models) {
|
||||
return false;
|
||||
} else if (num_unique_elms > .9 * training_sample.size()) {
|
||||
this->enable_dups_detection = false;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------//
|
||||
// TRAIN THE MODELS //
|
||||
//----------------------------------------------------------//
|
||||
|
||||
// Populate the training data for the root model
|
||||
for (long i = 0; i < SAMPLE_SZ; ++i) {
|
||||
training_data[0][0].push_back(
|
||||
{this->training_sample[i], 1. * i / SAMPLE_SZ});
|
||||
}
|
||||
|
||||
// Train the root model using linear interpolation
|
||||
auto *current_training_data = &training_data[0][0];
|
||||
linear_model *current_model = &(this->root_model);
|
||||
|
||||
// Find the min and max values in the training set
|
||||
training_point<T> min = current_training_data->front();
|
||||
training_point<T> max = current_training_data->back();
|
||||
|
||||
// Calculate the slope and intercept terms, assuming min.y = 0 and max.y
|
||||
current_model->slope = 1. / (max.x - min.x);
|
||||
current_model->intercept = -current_model->slope * min.x;
|
||||
|
||||
// Extrapolate for the number of models in the next layer
|
||||
current_model->slope *= this->hp.num_leaf_models - 1;
|
||||
current_model->intercept *= this->hp.num_leaf_models - 1;
|
||||
|
||||
// Populate the training data for the next layer
|
||||
for (const auto &d : *current_training_data) {
|
||||
// Predict the model index in next layer
|
||||
long rank = current_model->slope * d.x + current_model->intercept;
|
||||
|
||||
// Normalize the rank between 0 and the number of models in the next layer
|
||||
rank = std::max(static_cast<long>(0),
|
||||
std::min(this->hp.num_leaf_models - 1, rank));
|
||||
|
||||
// Place the data in the predicted training bucket
|
||||
training_data[1][rank].push_back(d);
|
||||
}
|
||||
|
||||
// Train the leaf models
|
||||
for (long model_idx = 0; model_idx < this->hp.num_leaf_models;
|
||||
++model_idx) {
|
||||
// Update iterator variables
|
||||
current_training_data = &training_data[1][model_idx];
|
||||
current_model = &(this->leaf_models[model_idx]);
|
||||
|
||||
// Interpolate the min points in the training buckets
|
||||
if (model_idx == 0) {
|
||||
// The current model is the first model in the current layer
|
||||
|
||||
if (current_training_data->size() < 2) {
|
||||
// Case 1: The first model in this layer is empty
|
||||
current_model->slope = 0;
|
||||
current_model->intercept = 0;
|
||||
|
||||
// Insert a fictive training point to avoid propagating more than one
|
||||
// empty initial model.
|
||||
training_point<T> tp;
|
||||
tp.x = 0;
|
||||
tp.y = 0;
|
||||
current_training_data->push_back(tp);
|
||||
} else {
|
||||
// Case 2: The first model in this layer is not empty
|
||||
|
||||
min = current_training_data->front();
|
||||
max = current_training_data->back();
|
||||
|
||||
// Hallucinating as if min.y = 0
|
||||
current_model->slope = (1. * max.y) / (max.x - min.x);
|
||||
current_model->intercept = min.y - current_model->slope * min.x;
|
||||
}
|
||||
} else if (model_idx == this->hp.num_leaf_models - 1) {
|
||||
if (current_training_data->empty()) {
|
||||
// Case 3: The final model in this layer is empty
|
||||
|
||||
current_model->slope = 0;
|
||||
current_model->intercept = 1;
|
||||
} else {
|
||||
// Case 4: The last model in this layer is not empty
|
||||
|
||||
min = training_data[1][model_idx - 1].back();
|
||||
max = current_training_data->back();
|
||||
|
||||
// Hallucinating as if max.y = 1
|
||||
current_model->slope = (1. - min.y) / (max.x - min.x);
|
||||
current_model->intercept = min.y - current_model->slope * min.x;
|
||||
}
|
||||
} else {
|
||||
// The current model is not the first model in the current layer
|
||||
|
||||
if (current_training_data->empty()) {
|
||||
// Case 5: The intermediate model in this layer is empty
|
||||
current_model->slope = 0;
|
||||
current_model->intercept = training_data[1][model_idx - 1]
|
||||
.back()
|
||||
.y; // If the previous model
|
||||
// was empty too, it will
|
||||
// use the fictive
|
||||
// training points
|
||||
|
||||
// Insert a fictive training point to avoid propagating more than one
|
||||
// empty initial model.
|
||||
// NOTE: This will _NOT_ lead to DIV/0 due to identical x's and y's
|
||||
// because it is working backwards.
|
||||
training_point<T> tp;
|
||||
tp.x = training_data[1][model_idx - 1].back().x;
|
||||
tp.y = training_data[1][model_idx - 1].back().y;
|
||||
current_training_data->push_back(tp);
|
||||
} else {
|
||||
// Case 6: The intermediate leaf model is not empty
|
||||
|
||||
min = training_data[1][model_idx - 1].back();
|
||||
max = current_training_data->back();
|
||||
|
||||
current_model->slope = (max.y - min.y) / (max.x - min.x);
|
||||
current_model->intercept = min.y - current_model->slope * min.x;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE:
|
||||
// The last stage (layer) of this model contains weights that predict the
|
||||
// CDF of the keys (i.e. Range is [0-1]) When using this model to predict
|
||||
// the position of the keys in the sorted order, you MUST scale the weights
|
||||
// of the last layer to whatever range you are predicting for. The inner
|
||||
// layers of the model have already been extrapolated to the length of the
|
||||
// stage.
|
||||
//
|
||||
// This is a design choice to help with the portability of the model.
|
||||
//
|
||||
this->trained = true;
|
||||
|
||||
return true;
|
||||
}
|
||||
};
|
||||
} // namespace learned_sort
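// Usage sketch, using only the members defined above (not part of the
// original header):
//
//   std::vector<int64_t> keys = /* at least Params::MIN_SORTING_SIZE items */;
//   learned_sort::TwoLayerRMI<int64_t>::Params p;   // default hyperparameters
//   learned_sort::TwoLayerRMI<int64_t> rmi(p);
//   if (rmi.train(keys.begin(), keys.end())) {
//     // rmi.root_model and rmi.leaf_models now approximate the CDF of keys;
//     // scale a leaf prediction by the output range to estimate a position.
//   }
//
// learned_sort.h builds on this model to place keys into approximately sorted
// buckets, with any residual disorder cleaned up by an insertion-sort pass
// (see the helper in third_party/libcxx/utils.h).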
|
||||
|
||||
#endif /* COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_RMI_H_ */
|
22  third_party/libcxx/ska_sort.cc (vendored, new file)
@@ -0,0 +1,22 @@
/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│
│vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2023 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "third_party/libcxx/ska_sort.h"
#include "third_party/libcxx/ska_sort.h"

void ska_sort_int64(int64_t* A, size_t n) { ska_sort(A, A + n); }
1457  third_party/libcxx/ska_sort.h (vendored, new file)
(File diff suppressed because it is too large.)
42  third_party/libcxx/utils.h (vendored, new file)
@@ -0,0 +1,42 @@
// -*- c++ -*-
// clang-format off
#ifndef COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_UTILS_H_
#define COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_UTILS_H_
#include "third_party/libcxx/iterator"

namespace learned_sort {
namespace utils {

// Helper functions for working with unsigned types
template <typename T>
inline T _get_key(T a) {
  return a;
}

template <class RandomIt>
void insertion_sort(RandomIt begin, RandomIt end) {
  // Determine the data type
  typedef typename std::iterator_traits<RandomIt>::value_type T;

  // Determine the input size
  const size_t input_sz = std::distance(begin, end);

  if (input_sz <= 0) return;

  RandomIt cmp_idx;
  T key;
  for (auto i = begin + 1; i != end; ++i) {
    key = i[0];
    cmp_idx = i - 1;
    while (cmp_idx >= begin && cmp_idx[0] > key) {
      cmp_idx[1] = cmp_idx[0];
      --cmp_idx;
    }
    cmp_idx[1] = key;
  }
}

}  // namespace utils
}  // namespace learned_sort

#endif /* COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_UTILS_H_ */