mirror of https://github.com/jart/cosmopolitan.git
synced 2025-08-03 00:10:31 +00:00
Moar sorting
This commit is contained in:
parent dbb8d66f22
commit 7f2bf3895e
11 changed files with 3627 additions and 9 deletions
308  third_party/libcxx/rmi.h  (vendored, normal file)
@@ -0,0 +1,308 @@
// -*- c++ -*-
// clang-format off
#ifndef COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_RMI_H_
#define COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_RMI_H_
#include "third_party/libcxx/algorithm"
#include "third_party/libcxx/cstdio"
#include "third_party/libcxx/iostream"
#include "third_party/libcxx/vector"

using namespace std;

namespace learned_sort {

// Packs the key and its respective scaled CDF value
template <typename T>
struct training_point {
  T x;
  double y;
};

// Represents linear models
struct linear_model {
  double slope = 0;
  double intercept = 0;
};

// An implementation of a 2-layer RMI model
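// The root model routes each key to one of `num_leaf_models` leaf models,
// and the selected leaf model approximates the key's scaled CDF value in
// [0, 1].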
template <class T>
class TwoLayerRMI {
 public:
  // CDF model hyperparameters
  struct Params {
    // Member fields
    long fanout;
    float sampling_rate;
    long threshold;
    long num_leaf_models;

    // Default hyperparameters
    static constexpr long DEFAULT_FANOUT = 1e3;
    static constexpr float DEFAULT_SAMPLING_RATE = .01;
    static constexpr long DEFAULT_THRESHOLD = 100;
    static constexpr long DEFAULT_NUM_LEAF_MODELS = 1000;
    static constexpr long MIN_SORTING_SIZE = 1e4;

    // Default constructor
    Params() {
      this->fanout = DEFAULT_FANOUT;
      this->sampling_rate = DEFAULT_SAMPLING_RATE;
      this->threshold = DEFAULT_THRESHOLD;
      this->num_leaf_models = DEFAULT_NUM_LEAF_MODELS;
    }

    // Constructor with custom hyperparameter values
    Params(float sampling_rate, float overallocation, long fanout,
           long batch_sz, long threshold) {
      this->fanout = fanout;
      this->sampling_rate = sampling_rate;
      this->threshold = threshold;
      this->num_leaf_models = DEFAULT_NUM_LEAF_MODELS;
    }
  };

  // Member variables of the CDF model
  bool trained;
  linear_model root_model;
  vector<linear_model> leaf_models;
  vector<T> training_sample;
  Params hp;
  bool enable_dups_detection;

  // CDF model constructor
  TwoLayerRMI(Params p) {
    this->trained = false;
    this->hp = p;
    this->leaf_models.resize(p.num_leaf_models);
    this->enable_dups_detection = true;
  }

  // Pretty-printing function
  void print() {
    printf("[0][0]: slope=%0.5f; intercept=%0.5f;\n", root_model.slope,
           root_model.intercept);
    for (int model_idx = 0; model_idx < hp.num_leaf_models; ++model_idx) {
      printf("[%i][1]: slope=%0.5f; intercept=%0.5f;\n", model_idx,
             leaf_models[model_idx].slope, leaf_models[model_idx].intercept);
    }
    cout << "-----------------------------" << endl;
  }

  /**
   * @brief Train a CDF function with an RMI architecture, using linear spline
   * interpolation.
   *
   * @param begin Random-access iterator to the initial position of the
   * sequence used for training. The range used is [begin,end), which
   * contains all the elements between begin and end, including the element
   * pointed to by begin but not the element pointed to by end.
   * @param end Random-access iterator to the final position of the sequence
   * used for training.
   * @return true if the model was trained successfully, false otherwise.
   */
  bool train(typename vector<T>::iterator begin,
             typename vector<T>::iterator end) {
    // Determine input size
    const long INPUT_SZ = std::distance(begin, end);

    // Validate parameters
    if (this->hp.fanout >= INPUT_SZ) {
      this->hp.fanout = TwoLayerRMI<T>::Params::DEFAULT_FANOUT;
      cerr << "\33[93;1mWARNING\33[0m: Invalid fanout. Using default ("
           << TwoLayerRMI<T>::Params::DEFAULT_FANOUT << ")." << endl;
    }

    if (this->hp.sampling_rate <= 0 or this->hp.sampling_rate > 1) {
      this->hp.sampling_rate = TwoLayerRMI<T>::Params::DEFAULT_SAMPLING_RATE;
      cerr << "\33[93;1mWARNING\33[0m: Invalid sampling rate. Using default ("
           << TwoLayerRMI<T>::Params::DEFAULT_SAMPLING_RATE << ")." << endl;
    }

    if (this->hp.threshold <= 0 or this->hp.threshold >= INPUT_SZ or
        this->hp.threshold >= INPUT_SZ / this->hp.fanout) {
      this->hp.threshold = TwoLayerRMI<T>::Params::DEFAULT_THRESHOLD;
      cerr << "\33[93;1mWARNING\33[0m: Invalid threshold. Using default ("
           << TwoLayerRMI<T>::Params::DEFAULT_THRESHOLD << ")." << endl;
    }

    // Initialize the CDF model
    static const long NUM_LAYERS = 2;
    vector<vector<vector<training_point<T> > > > training_data(NUM_LAYERS);
    for (long layer_idx = 0; layer_idx < NUM_LAYERS; ++layer_idx) {
      training_data[layer_idx].resize(hp.num_leaf_models);
    }

    //----------------------------------------------------------//
    //                          SAMPLE                           //
    //----------------------------------------------------------//

    // Determine sample size
    const long SAMPLE_SZ = std::min<long>(
        INPUT_SZ, std::max<long>(this->hp.sampling_rate * INPUT_SZ,
                                 TwoLayerRMI<T>::Params::MIN_SORTING_SIZE));

    // Create a sample array
    this->training_sample.reserve(SAMPLE_SZ);

    // Start sampling
    long offset = static_cast<long>(1. * INPUT_SZ / SAMPLE_SZ);
    for (auto i = begin; i < end; i += offset) {
      // NOTE: We don't directly assign SAMPLE_SZ to this->training_sample_sz
      // to avoid issues with divisibility
      this->training_sample.push_back(*i);
    }

    // Sort the sample
    std::sort(this->training_sample.begin(), this->training_sample.end());

    // Count the number of unique keys
    auto sample_cpy = this->training_sample;
    long num_unique_elms = std::distance(
        sample_cpy.begin(), std::unique(sample_cpy.begin(), sample_cpy.end()));

    // Stop early if the array has very few unique values. We need at least 2
    // unique training examples per leaf model.
    if (num_unique_elms < 2 * this->hp.num_leaf_models) {
      return false;
    } else if (num_unique_elms > .9 * training_sample.size()) {
      this->enable_dups_detection = false;
    }

    //----------------------------------------------------------//
    //                      TRAIN THE MODELS                     //
    //----------------------------------------------------------//

    // Populate the training data for the root model
    for (long i = 0; i < SAMPLE_SZ; ++i) {
      training_data[0][0].push_back(
          {this->training_sample[i], 1. * i / SAMPLE_SZ});
    }

    // Train the root model using linear interpolation
    auto *current_training_data = &training_data[0][0];
    linear_model *current_model = &(this->root_model);

    // Find the min and max values in the training set
    training_point<T> min = current_training_data->front();
    training_point<T> max = current_training_data->back();

    // Calculate the slope and intercept terms, assuming min.y = 0 and
    // max.y = 1
    current_model->slope = 1. / (max.x - min.x);
    current_model->intercept = -current_model->slope * min.x;

    // Extrapolate for the number of models in the next layer
    current_model->slope *= this->hp.num_leaf_models - 1;
    current_model->intercept *= this->hp.num_leaf_models - 1;

    // Populate the training data for the next layer
    for (const auto &d : *current_training_data) {
      // Predict the model index in the next layer
      long rank = current_model->slope * d.x + current_model->intercept;

      // Clamp the rank between 0 and the number of models in the next layer
      rank = std::max(static_cast<long>(0),
                      std::min(this->hp.num_leaf_models - 1, rank));

      // Place the data in the predicted training bucket
      training_data[1][rank].push_back(d);
    }

    // Train the leaf models
    for (long model_idx = 0; model_idx < this->hp.num_leaf_models;
         ++model_idx) {
      // Update iterator variables
      current_training_data = &training_data[1][model_idx];
      current_model = &(this->leaf_models[model_idx]);

      // Interpolate the min points in the training buckets
      if (model_idx == 0) {
        // The current model is the first model in the current layer

        if (current_training_data->size() < 2) {
          // Case 1: The first model in this layer is empty
          current_model->slope = 0;
          current_model->intercept = 0;

          // Insert a fictive training point to avoid propagating more than
          // one empty initial model.
          training_point<T> tp;
          tp.x = 0;
          tp.y = 0;
          current_training_data->push_back(tp);
        } else {
          // Case 2: The first model in this layer is not empty

          min = current_training_data->front();
          max = current_training_data->back();

          // Hallucinating as if min.y = 0
          current_model->slope = (1. * max.y) / (max.x - min.x);
          current_model->intercept = min.y - current_model->slope * min.x;
        }
      } else if (model_idx == this->hp.num_leaf_models - 1) {
        if (current_training_data->empty()) {
          // Case 3: The final model in this layer is empty

          current_model->slope = 0;
          current_model->intercept = 1;
        } else {
          // Case 4: The last model in this layer is not empty

          min = training_data[1][model_idx - 1].back();
          max = current_training_data->back();

          // Hallucinating as if max.y = 1
          current_model->slope = (1. - min.y) / (max.x - min.x);
          current_model->intercept = min.y - current_model->slope * min.x;
        }
      } else {
        // The current model is an intermediate model in the current layer

        if (current_training_data->empty()) {
          // Case 5: The intermediate model in this layer is empty
          current_model->slope = 0;
          // If the previous model was empty too, this reuses its fictive
          // training point
          current_model->intercept = training_data[1][model_idx - 1].back().y;

          // Insert a fictive training point to avoid propagating more than
          // one empty initial model.
          // NOTE: This will _NOT_ cause a DIV/0 from identical x's and y's
          // because it is working backwards.
          training_point<T> tp;
          tp.x = training_data[1][model_idx - 1].back().x;
          tp.y = training_data[1][model_idx - 1].back().y;
          current_training_data->push_back(tp);
        } else {
          // Case 6: The intermediate leaf model is not empty

          min = training_data[1][model_idx - 1].back();
          max = current_training_data->back();

          current_model->slope = (max.y - min.y) / (max.x - min.x);
          current_model->intercept = min.y - current_model->slope * min.x;
        }
      }
    }

    // NOTE:
    // The last stage (layer) of this model contains weights that predict the
    // CDF of the keys (i.e. range is [0, 1]). When using this model to
    // predict the position of the keys in the sorted order, you MUST scale
    // the weights of the last layer to whatever range you are predicting
    // for. The inner layers of the model have already been extrapolated to
    // the length of the stage.
    //
    // This is a design choice to help with the portability of the model.
    //
    this->trained = true;

    return true;
  }
};

}  // namespace learned_sort

#endif /* COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_RMI_H_ */
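The closing NOTE means callers must compose the two layers and rescale the
leaf output themselves. Below is a minimal caller-side sketch of that
composition; the predict_position helper is illustrative and not part of
this header (which only defines train), so treat it as an assumption about
intended use rather than the library's API.

    // Hypothetical helper: chain root -> leaf, then scale the leaf's CDF
    // estimate from [0, 1] to a position in [0, n).
    template <class T>
    long predict_position(learned_sort::TwoLayerRMI<T> &rmi, T key, long n) {
      // Root model picks the responsible leaf model (clamped to valid range)
      long leaf = rmi.root_model.slope * key + rmi.root_model.intercept;
      leaf = std::max(0L, std::min(rmi.hp.num_leaf_models - 1, leaf));
      // Leaf model estimates the key's CDF value
      double cdf = rmi.leaf_models[leaf].slope * key +
                   rmi.leaf_models[leaf].intercept;
      // Scale the CDF estimate to the output range and clamp
      long pos = static_cast<long>(cdf * n);
      return std::max(0L, std::min(n - 1, pos));
    }

    // Usage: train on the keys, then predict a key's sorted position.
    //   std::vector<double> keys = /* ... */;
    //   learned_sort::TwoLayerRMI<double> rmi(
    //       learned_sort::TwoLayerRMI<double>::Params());
    //   if (rmi.train(keys.begin(), keys.end())) {
    //     long pos = predict_position(rmi, keys[0], (long)keys.size());
    //   }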