// -*- c++ -*-
// clang-format off
#ifndef COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_RMI_H_
#define COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_RMI_H_
#include "third_party/libcxx/iostream"
#include "third_party/libcxx/vector"
using namespace std;
namespace learned_sort {
// Packs the key and its respective scaled CDF value
template <typename T>
struct training_point {
T x;
double y;
};
// Represents a linear model of the form y = slope * x + intercept
struct linear_model {
double slope = 0;
double intercept = 0;
};
// An implementation of a 2-layer RMI model
template <class T>
class TwoLayerRMI {
public:
// CDF model hyperparameters
struct Params {
// Member fields
long fanout;
float sampling_rate;
long threshold;
long num_leaf_models;
// Default hyperparameters
    static constexpr long DEFAULT_FANOUT = 1000;
    static constexpr float DEFAULT_SAMPLING_RATE = .01;
    static constexpr long DEFAULT_THRESHOLD = 100;
    static constexpr long DEFAULT_NUM_LEAF_MODELS = 1000;
    static constexpr long MIN_SORTING_SIZE = 10000;
// Default constructor
Params() {
this->fanout = DEFAULT_FANOUT;
this->sampling_rate = DEFAULT_SAMPLING_RATE;
this->threshold = DEFAULT_THRESHOLD;
this->num_leaf_models = DEFAULT_NUM_LEAF_MODELS;
}
    // Constructor with custom hyperparameter values (the overallocation and
    // batch_sz parameters are currently unused)
    Params(float sampling_rate, float overallocation, long fanout,
           long batch_sz, long threshold) {
this->fanout = fanout;
this->sampling_rate = sampling_rate;
this->threshold = threshold;
this->num_leaf_models = DEFAULT_NUM_LEAF_MODELS;
}
};
// Member variables of the CDF model
bool trained;
linear_model root_model;
vector<linear_model> leaf_models;
vector<T> training_sample;
Params hp;
bool enable_dups_detection;
// CDF model constructor
TwoLayerRMI(Params p) {
this->trained = false;
this->hp = p;
this->leaf_models.resize(p.num_leaf_models);
this->enable_dups_detection = true;
}
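  // Illustrative usage sketch (not part of this header's API); assumes a
  // vector of numeric keys:
  //
  //   std::vector<double> keys = /* ... */;
  //   learned_sort::TwoLayerRMI<double>::Params p;  // default hyperparameters
  //   learned_sort::TwoLayerRMI<double> rmi(p);
  //   if (rmi.train(keys.begin(), keys.end())) {
  //     // rmi.leaf_models now approximates the CDF of the keys
  //   }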
  // Pretty-printing function
  void print() {
    printf("[0][0]: slope=%0.5f; intercept=%0.5f;\n", root_model.slope,
           root_model.intercept);
    for (long model_idx = 0; model_idx < hp.num_leaf_models; ++model_idx) {
      printf("[%ld][1]: slope=%0.5f; intercept=%0.5f;\n", model_idx,
             leaf_models[model_idx].slope, leaf_models[model_idx].intercept);
    }
    printf("-----------------------------\n");
  }
  /**
   * @brief Train a CDF function with an RMI architecture, using linear spline
   * interpolation.
   *
   * @param begin Random-access iterator to the first element of the sequence
   * used for training. The range used is [begin, end), which includes the
   * element pointed to by begin but not the element pointed to by end.
   * @param end Random-access iterator to one past the last element of the
   * sequence used for training.
   * @return true if the model was trained successfully, false otherwise.
   */
bool train(typename vector<T>::iterator begin,
typename vector<T>::iterator end) {
// Determine input size
const long INPUT_SZ = std::distance(begin, end);
// Validate parameters
if (this->hp.fanout >= INPUT_SZ) {
this->hp.fanout = TwoLayerRMI<T>::Params::DEFAULT_FANOUT;
cerr << "\33[93;1mWARNING\33[0m: Invalid fanout. Using default ("
<< TwoLayerRMI<T>::Params::DEFAULT_FANOUT << ")." << endl;
}
if (this->hp.sampling_rate <= 0 or this->hp.sampling_rate > 1) {
this->hp.sampling_rate = TwoLayerRMI<T>::Params::DEFAULT_SAMPLING_RATE;
cerr << "\33[93;1mWARNING\33[0m: Invalid sampling rate. Using default ("
<< TwoLayerRMI<T>::Params::DEFAULT_SAMPLING_RATE << ")." << endl;
}
if (this->hp.threshold <= 0 or this->hp.threshold >= INPUT_SZ or
this->hp.threshold >= INPUT_SZ / this->hp.fanout) {
this->hp.threshold = TwoLayerRMI<T>::Params::DEFAULT_THRESHOLD;
cerr << "\33[93;1mWARNING\33[0m: Invalid threshold. Using default ("
<< TwoLayerRMI<T>::Params::DEFAULT_THRESHOLD << ")." << endl;
}
// Initialize the CDF model
static const long NUM_LAYERS = 2;
vector<vector<vector<training_point<T> > > > training_data(NUM_LAYERS);
for (long layer_idx = 0; layer_idx < NUM_LAYERS; ++layer_idx) {
training_data[layer_idx].resize(hp.num_leaf_models);
}
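    // training_data[layer][model] holds the (key, CDF) training points routed
    // to that model; layer 0 uses only index 0 (the root model).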
//----------------------------------------------------------//
// SAMPLE //
//----------------------------------------------------------//
// Determine sample size
const long SAMPLE_SZ = std::min<long>(
INPUT_SZ, std::max<long>(this->hp.sampling_rate * INPUT_SZ,
TwoLayerRMI<T>::Params::MIN_SORTING_SIZE));
// Create a sample array
this->training_sample.reserve(SAMPLE_SZ);
    // Start sampling. The stride-based loop, rather than SAMPLE_SZ itself,
    // determines how many elements end up in the sample, which sidesteps
    // divisibility issues between INPUT_SZ and SAMPLE_SZ.
    long offset = static_cast<long>(1. * INPUT_SZ / SAMPLE_SZ);
    for (auto i = begin; i < end; i += offset) {
      this->training_sample.push_back(*i);
    }
    // Sort the sample using the default comparison
std::sort(this->training_sample.begin(), this->training_sample.end());
// Count the number of unique keys
auto sample_cpy = this->training_sample;
long num_unique_elms = std::distance(
sample_cpy.begin(), std::unique(sample_cpy.begin(), sample_cpy.end()));
    // Stop early if the array has very few unique values. We need at least 2
    // unique training examples per leaf model.
    if (num_unique_elms < 2 * this->hp.num_leaf_models) {
      return false;
    } else if (num_unique_elms > .9 * training_sample.size()) {
      // The sample is almost entirely unique, so duplicate detection is
      // unlikely to pay off
      this->enable_dups_detection = false;
    }
//----------------------------------------------------------//
// TRAIN THE MODELS //
//----------------------------------------------------------//
// Populate the training data for the root model
for (long i = 0; i < SAMPLE_SZ; ++i) {
training_data[0][0].push_back(
{this->training_sample[i], 1. * i / SAMPLE_SZ});
}
// Train the root model using linear interpolation
auto *current_training_data = &training_data[0][0];
linear_model *current_model = &(this->root_model);
// Find the min and max values in the training set
training_point<T> min = current_training_data->front();
training_point<T> max = current_training_data->back();
    // Calculate the slope and intercept terms, assuming min.y = 0 and
    // max.y = 1
current_model->slope = 1. / (max.x - min.x);
current_model->intercept = -current_model->slope * min.x;
// Extrapolate for the number of models in the next layer
current_model->slope *= this->hp.num_leaf_models - 1;
current_model->intercept *= this->hp.num_leaf_models - 1;
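    // For example, with num_leaf_models = 1000, a key equal to min.x now maps
    // to leaf model 0 and a key equal to max.x maps to leaf model 999.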
// Populate the training data for the next layer
for (const auto &d : *current_training_data) {
// Predict the model index in next layer
long rank = current_model->slope * d.x + current_model->intercept;
      // Clamp the rank to the valid leaf model range [0, num_leaf_models - 1]
rank = std::max(static_cast<long>(0),
std::min(this->hp.num_leaf_models - 1, rank));
// Place the data in the predicted training bucket
training_data[1][rank].push_back(d);
}
// Train the leaf models
for (long model_idx = 0; model_idx < this->hp.num_leaf_models;
++model_idx) {
// Update iterator variables
current_training_data = &training_data[1][model_idx];
current_model = &(this->leaf_models[model_idx]);
// Interpolate the min points in the training buckets
if (model_idx == 0) {
// The current model is the first model in the current layer
if (current_training_data->size() < 2) {
          // Case 1: The first model in this layer has fewer than two training
          // points, so it cannot be interpolated
current_model->slope = 0;
current_model->intercept = 0;
          // Insert a fictive training point to avoid propagating more than
          // one empty initial model.
training_point<T> tp;
tp.x = 0;
tp.y = 0;
current_training_data->push_back(tp);
} else {
// Case 2: The first model in this layer is not empty
min = current_training_data->front();
max = current_training_data->back();
// Hallucinating as if min.y = 0
current_model->slope = (1. * max.y) / (max.x - min.x);
current_model->intercept = min.y - current_model->slope * min.x;
}
} else if (model_idx == this->hp.num_leaf_models - 1) {
if (current_training_data->empty()) {
// Case 3: The final model in this layer is empty
current_model->slope = 0;
current_model->intercept = 1;
} else {
// Case 4: The last model in this layer is not empty
min = training_data[1][model_idx - 1].back();
max = current_training_data->back();
// Hallucinating as if max.y = 1
current_model->slope = (1. - min.y) / (max.x - min.x);
current_model->intercept = min.y - current_model->slope * min.x;
}
} else {
// The current model is not the first model in the current layer
if (current_training_data->empty()) {
          // Case 5: This intermediate model in this layer is empty. If the
          // previous model was empty too, this reuses its fictive training
          // point.
          current_model->slope = 0;
          current_model->intercept = training_data[1][model_idx - 1].back().y;
          // Insert a fictive training point to avoid propagating more than
          // one empty intermediate model.
          // NOTE: This will _NOT_ cause a division by zero from identical x's
          // and y's, because the interpolation always works backwards from
          // the previous bucket's last point.
          training_point<T> tp;
          tp.x = training_data[1][model_idx - 1].back().x;
          tp.y = training_data[1][model_idx - 1].back().y;
          current_training_data->push_back(tp);
} else {
// Case 6: The intermediate leaf model is not empty
min = training_data[1][model_idx - 1].back();
max = current_training_data->back();
current_model->slope = (max.y - min.y) / (max.x - min.x);
current_model->intercept = min.y - current_model->slope * min.x;
}
}
}
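    // Since each non-first leaf model is anchored at the previous bucket's
    // last training point, consecutive leaf segments join end-to-end,
    // approximating the sampled CDF with a piecewise-linear spline.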
    // NOTE:
    // The last stage (layer) of this model contains weights that predict the
    // CDF of the keys (i.e., the range is [0, 1]). When using this model to
    // predict the position of the keys in the sorted order, you MUST scale
    // the weights of the last layer to whatever range you are predicting
    // for. The inner layers of the model have already been extrapolated to
    // the length of the stage.
    //
    // This is a design choice to help with the portability of the model.
    //
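    // Illustrative sketch (not part of this header's API) of scaling a
    // trained model's CDF prediction to a position in an array of size n:
    //
    //   long leaf = root_model.slope * key + root_model.intercept;
    //   leaf = std::max(0L, std::min(hp.num_leaf_models - 1, leaf));
    //   double cdf =
    //       leaf_models[leaf].slope * key + leaf_models[leaf].intercept;
    //   long pos = std::max(0L, std::min(n - 1, static_cast<long>(cdf * n)));
    //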
this->trained = true;
return true;
}
};
} // namespace learned_sort
#endif /* COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_RMI_H_ */