Import C++ Standard Template Library
You can now use the hardest, fastest, and most dangerous language there is with Cosmopolitan. So far about 75% of LLVM libcxx has been added. A few breaking changes needed to be made to help this go smoothly:

- Rename nothrow to dontthrow
- Rename nodiscard to dontdiscard
- Add some libm functions, e.g. lgamma, nan, etc.
- Change intmax_t from int128 to int64 like everything else
- Introduce %jjd formatting directive for int128_t
- Introduce strtoi128(), strtou128(), etc.
- Rename bsrmax() to bsr128()

Some of the templates that should be working currently are std::vector, std::string, std::map, std::set, std::deque, etc.
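As a quick illustration, a program like the following should now build against the imported STL (a hypothetical sketch: the include paths and the exact strtoi128() prototype are assumptions based on the notes above, not code from this commit):

    // demo.cc (hypothetical sketch)
    #include "third_party/libcxx/string"
    #include "third_party/libcxx/vector"

    int main() {
      std::vector<std::string> words = {"hello", "world"};
      // strtoi128() and %jjd are introduced by this change; the prototype
      // is assumed to mirror strtol()
      int128_t n = strtoi128("170141183460469231731687303715884105727", 0, 10);
      printf("%jjd %s %s\n", n, words[0].c_str(), words[1].c_str());
    }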
This commit is contained in:
parent 5022f9e920
commit 868af3f950

286 changed files with 123987 additions and 507 deletions
807 third_party/smallz4/smallz4.hh (vendored) (new file)
@@ -0,0 +1,807 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_SMALLZ4_SMALLZ4_H_
#define COSMOPOLITAN_THIRD_PARTY_SMALLZ4_SMALLZ4_H_
#include "third_party/libcxx/vector"

/**
 * LZ4 compression with optimal parsing
 *
 * See smallz4.cc for a basic I/O interface. You can easily replace it
 * with an in-memory version; then all you have to do is:
 *
 *   smallz4::lz4(GET_BYTES, SEND_BYTES);
 *
 * For more advanced stuff, you can call lz4 with up to four parameters
 * (incl. max chain length and a dictionary)
 */
class smallz4 {
 public:
  // read several bytes, see getBytesFromIn() in smallz4.cc for a basic
  // implementation
  typedef size_t (*GET_BYTES)(void* data, size_t numBytes, void* userPtr);
  // write several bytes, see sendBytesToOut() in smallz4.cc for a basic
  // implementation
  typedef void (*SEND_BYTES)(const void* data, size_t numBytes, void* userPtr);
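
  // For example, in-memory callbacks could look like this (a sketch, not
  // part of this header; Buf, get() and send() are hypothetical names):
  //
  //   struct Buf {
  //     const unsigned char* in;
  //     size_t len, pos;
  //     std::vector<unsigned char> out;
  //   };
  //   size_t get(void* data, size_t n, void* p) {
  //     Buf* b = (Buf*)p;
  //     if (n > b->len - b->pos) n = b->len - b->pos;
  //     memcpy(data, b->in + b->pos, n);
  //     b->pos += n;
  //     return n;
  //   }
  //   void send(const void* data, size_t n, void* p) {
  //     Buf* b = (Buf*)p;
  //     b->out.insert(b->out.end(), (const unsigned char*)data,
  //                   (const unsigned char*)data + n);
  //   }
  //
  // and a caller would pass its state through userPtr:
  //
  //   Buf buf = {input, inputLen, 0, {}};
  //   smallz4::lz4(get, send, 65535, false, &buf);  // 65535 = default chain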

  /// compress everything in input stream (accessed via getBytes) and write to
  /// output stream (via send)
  static void lz4(GET_BYTES getBytes, SEND_BYTES sendBytes,
                  unsigned short maxChainLength = MaxChainLength,
                  bool useLegacyFormat = false, void* userPtr = NULL) {
    lz4(getBytes, sendBytes, maxChainLength, std::vector<unsigned char>(),
        useLegacyFormat, userPtr);
  }

  /// compress everything in input stream (accessed via getBytes) and write to
  /// output stream (via send)
  static void lz4(
      GET_BYTES getBytes, SEND_BYTES sendBytes, unsigned short maxChainLength,
      const std::vector<unsigned char>& dictionary,  // predefined dictionary
      bool useLegacyFormat =
          false,  // old format is 7 bytes smaller if input < 8 MB
      void* userPtr = NULL) {
    smallz4 obj(maxChainLength);
    obj.compress(getBytes, sendBytes, dictionary, useLegacyFormat, userPtr);
  }

  /// version string
  static const char* const getVersion() {
    return "1.5";
  }

  // compression level thresholds, made public because I display them in the
  // help screen ...
  enum {
    /// greedy mode for short chains (compression level <= 3) instead of
    /// optimal parsing / lazy evaluation
    ShortChainsGreedy = 3,
    /// lazy evaluation for medium-sized chains (compression level > 3 and
    /// <= 6)
    ShortChainsLazy = 6
  };

  // ----- END OF PUBLIC INTERFACE -----
 private:
  // ----- constants and types -----

  /// a block can be up to 4 MB, so uint32_t would suffice but uint64_t is
  /// quite a bit faster on my x64 machine
  typedef uint64_t Length;
  /// matches must start within the most recent 64k
  typedef uint16_t Distance;

  enum {
    /// each match's length must be >= 4
    MinMatch = 4,
    /// a literal needs one byte
    JustLiteral = 1,
    /// last match must not be closer than 12 bytes to the end
    BlockEndNoMatch = 12,
    /// last 5 bytes must be literals, no matching allowed
    BlockEndLiterals = 5,

    /// match finder's hash table size (2^HashBits entries, must be less than
    /// 32)
    HashBits = 20,
    HashSize = 1 << HashBits,

    /// input buffer size, can be any number but zero ;-)
    BufferSize = 1024,

    /// maximum match distance, must be power of 2 minus 1
    MaxDistance = 65535,
    /// marker for "no match"
    EndOfChain = 0,
    /// stop match finding after MaxChainLength steps (default is unlimited =>
    /// optimal parsing)
    MaxChainLength = MaxDistance,

    /// significantly speed up parsing if the same byte is repeated a lot, may
    /// cause sub-optimal compression
    MaxSameLetter = 19 + 255 * 256,  // was: 19 + 255,
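    // (19 = MinMatch plus the 15 that fits in the token; each additional
    // length byte covers another 255, so this threshold is a run long
    // enough to need about 256 length bytes)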

    /// maximum block size as defined in LZ4 spec: {
    /// 0,0,0,0,64*1024,256*1024,1024*1024,4*1024*1024 } I only work with the
    /// biggest maximum block size (7)
    // note: xxhash header checksum is precalculated only for 7, too
    MaxBlockSizeId = 7,
    MaxBlockSize = 4 * 1024 * 1024,

    /// legacy format has a fixed block size of 8 MB
    MaxBlockSizeLegacy = 8 * 1024 * 1024,

    /// number of literals and match length is encoded in several bytes, max.
    /// 255 per byte
    MaxLengthCode = 255
  };

  // ----- one and only variable ... -----

  /// how many matches are checked in findLongestMatch, lower values yield
  /// faster encoding at the cost of worse compression ratio
  unsigned short maxChainLength;

  // ----- code -----

  /// match
  struct Match {
    /// length of match
    Length length;
    /// start of match
    Distance distance;
  };

  /// create new compressor (only invoked by lz4)
  explicit smallz4(unsigned short newMaxChainLength = MaxChainLength)
      : maxChainLength(newMaxChainLength)  // => no limit, but can be changed
                                           // by setMaxChainLength
  {
  }

  /// return true, if the four bytes at *a and *b match
  inline static bool match4(const void* const a, const void* const b) {
    return *(const uint32_t*)a == *(const uint32_t*)b;
  }

  /// simple hash function, input: 32 bits, output: HashBits bits (by default:
  /// 20)
  inline static uint32_t getHash32(uint32_t fourBytes) {
    // taken from https://en.wikipedia.org/wiki/Linear_congruential_generator
    const uint32_t HashMultiplier = 48271;
    return ((fourBytes * HashMultiplier) >> (32 - HashBits)) & (HashSize - 1);
  }

  /// find longest match of data[pos] between data[begin] and data[end], use
  /// match chain
  Match findLongestMatch(const unsigned char* const data, uint64_t pos,
                         uint64_t begin, uint64_t end,
                         const Distance* const chain) const {
    Match result;
    result.length = JustLiteral;  // assume a literal => one byte

    // compression level: look only at the first n entries of the match chain
    unsigned short stepsLeft = maxChainLength;
    // findLongestMatch() shouldn't be called when maxChainLength = 0
    // (uncompressed)

    // pointer to position that is currently analyzed (which we try to find a
    // great match for)
    const unsigned char* const current = data + pos - begin;
    // don't match beyond this point
    const unsigned char* const stop = current + end - pos;

    // get distance to previous match, abort if 0 => not existing
    Distance distance = chain[pos & MaxDistance];
    int64_t totalDistance = 0;
    while (distance != EndOfChain) {
      // chain goes too far back ?
      totalDistance += distance;
      if (totalDistance > MaxDistance) break;  // can't match beyond 64k

      // prepare next position
      distance = chain[(pos - totalDistance) & MaxDistance];

      // let's introduce a new pointer atLeast that points to the first "new"
      // byte of a potential longer match
      const unsigned char* const atLeast = current + result.length + 1;
      // impossible to find a longer match because not enough bytes left ?
      if (atLeast > stop) break;

      // the idea is to split the comparison algorithm into 2 phases
      // (1) scan backward from atLeast to current, abort if mismatch
      // (2) scan forward until a mismatch is found and store length/distance
      //     of this new best match
      //
      //   current                  atLeast
      //      |                        |
      //      -<<<<<<<< phase 1 <<<<<<<<
      //      >>> phase 2 >>>
      //
      // main reason for phase 1:
      // - both byte sequences start with the same bytes, quite likely they
      //   are very similar
      // - there is a good chance that if they differ, then their last bytes
      //   differ
      // => checking the last first increases the probability that a mismatch
      //    is detected as early as possible

      // compare 4 bytes at once
      const Length CheckAtOnce = 4;

      // all bytes between current and atLeast shall be identical
      const unsigned char* phase1 =
          atLeast - CheckAtOnce;  // minus 4 because match4 checks 4 bytes
      while (phase1 > current && match4(phase1, phase1 - totalDistance))
        phase1 -= CheckAtOnce;
      // note: - the first four bytes always match
      //       - in the last iteration, phase1 points either at current + 1 or
      //         current + 2 or current + 3
      //       - therefore we compare a few bytes twice => but a check to skip
      //         these checks is more expensive

      // mismatch ? (the while-loop was aborted)
      if (phase1 > current) continue;

      // we have a new best match, now scan forward
      const unsigned char* phase2 = atLeast;

      // fast loop: check four bytes at once
      while (phase2 + CheckAtOnce <= stop &&
             match4(phase2, phase2 - totalDistance))
        phase2 += CheckAtOnce;
      // slow loop: check the last 1/2/3 bytes
      while (phase2 < stop && *phase2 == *(phase2 - totalDistance)) phase2++;

      // store new best match
      result.distance = Distance(totalDistance);
      result.length = Length(phase2 - current);

      // stop searching on lower compression levels
      if (--stepsLeft == 0) break;
    }

    return result;
  }

  /// create shortest output
  /** data points to block's begin; we need it to extract literals **/
  static std::vector<unsigned char> selectBestMatches(
      const std::vector<Match>& matches, const unsigned char* const data) {
    // store encoded data
    std::vector<unsigned char> result;
    result.reserve(matches.size());

    // indices of current run of literals
    size_t literalsFrom = 0;
    size_t numLiterals = 0;

    bool lastToken = false;

    // walk through the whole block
    for (size_t offset = 0;
         offset < matches.size();)  // increment inside of loop
    {
      // get best cost-weighted match
      Match match = matches[offset];

      // if no match, then count literals instead
      if (match.length <= JustLiteral) {
        // first literal ? need to reset pointers of current sequence of
        // literals
        if (numLiterals == 0) literalsFrom = offset;

        // add one more literal to current sequence
        numLiterals++;

        // next match
        offset++;

        // continue unless it's the last literal
        if (offset < matches.size()) continue;

        lastToken = true;
      } else {
        // skip unused matches
        offset += match.length;
      }

      // store match length (4 is implied because it's the minimum match
      // length)
      int matchLength = int(match.length) - MinMatch;

      // last token has zero length
      if (lastToken) matchLength = 0;

      // token consists of match length and number of literals, let's start
      // with match length ...
      unsigned char token =
          (matchLength < 15) ? (unsigned char)matchLength : 15;

      // >= 15 literals ? (extra bytes to store length)
      if (numLiterals < 15) {
        // add number of literals in higher four bits
        token |= numLiterals << 4;
        result.push_back(token);
      } else {
        // set all higher four bits, the following bytes will determine the
        // exact number of literals
        result.push_back(token | 0xF0);

        // 15 is already encoded in token
        int encodeNumLiterals = int(numLiterals) - 15;

        // emit 255 until remainder is below 255
        while (encodeNumLiterals >= MaxLengthCode) {
          result.push_back(MaxLengthCode);
          encodeNumLiterals -= MaxLengthCode;
        }
        // and the last byte (can be zero, too)
        result.push_back((unsigned char)encodeNumLiterals);
      }
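
      // worked example (illustration only): 20 literals followed by an
      // 18-byte match give matchLength = 18 - 4 = 14 (fits in the low
      // nibble) and a literal count of 20 >= 15 (high nibble 0xF plus one
      // extra byte 20 - 15 = 5), i.e. token 0xFE, then 0x05, then the 20
      // literal bytes, then the two distance bytes below
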
      // copy literals
      if (numLiterals > 0) {
        result.insert(result.end(), data + literalsFrom,
                      data + literalsFrom + numLiterals);

        // last token doesn't have a match
        if (lastToken) break;

        // reset
        numLiterals = 0;
      }

      // distance stored in 16 bits / little endian
      result.push_back(match.distance & 0xFF);
      result.push_back(match.distance >> 8);

      // >= 15+4 bytes matched
      if (matchLength >= 15) {
        // 15 is already encoded in token
        matchLength -= 15;
        // emit 255 until remainder is below 255
        while (matchLength >= MaxLengthCode) {
          result.push_back(MaxLengthCode);
          matchLength -= MaxLengthCode;
        }
        // and the last byte (can be zero, too)
        result.push_back((unsigned char)matchLength);
      }
    }

    return result;
  }

  /// walk backwards through all matches and compute number of compressed
  /// bytes from current position to the end of the block
  /** note: matches are modified (shortened length) if necessary **/
  static void estimateCosts(std::vector<Match>& matches) {
    const size_t blockEnd = matches.size();

    // equals the number of bytes after compression
    typedef uint32_t Cost;
    // minimum cost from this position to the end of the current block
    std::vector<Cost> cost(matches.size(), 0);
    // "cost" represents the number of bytes needed

    // the last bytes must always be literals
    Length numLiterals = BlockEndLiterals;
    // backwards optimal parsing
    for (int64_t i = (int64_t)blockEnd - (1 + BlockEndLiterals); i >= 0;
         i--)  // ignore the last 5 bytes, they are always literals
    {
      // if encoded as a literal
      numLiterals++;
      Length bestLength = JustLiteral;
      // such a literal "costs" 1 byte
      Cost minCost = cost[i + 1] + JustLiteral;

      // an extra length byte is required for every 255 literals
      if (numLiterals >= 15) {
        // same as: if ((numLiterals - 15) % MaxLengthCode == 0)
        // but I try hard to avoid the slow modulo function
        if (numLiterals == 15 || (numLiterals >= 15 + MaxLengthCode &&
                                  (numLiterals - 15) % MaxLengthCode == 0))
          minCost++;
      }
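      // i.e. the cost of a literal run grows by one byte exactly when it
      // reaches 15, 270, 525, ... literals (15 + k * 255)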

      // let's look at the longest match, almost always more efficient than
      // the plain literals
      Match match = matches[i];

      // very long self-referencing matches can slow down the program A LOT
      if (match.length >= MaxSameLetter && match.distance == 1) {
        // assume that longest match is always the best match
        // NOTE: this assumption might not be optimal !
        bestLength = match.length;
        minCost =
            cost[i + match.length] + 1 + 2 + 1 + Cost(match.length - 19) / 255;
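        // breakdown: one token byte + two distance bytes + the first extra
        // length byte + one more length byte per additional 255 of length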
      } else {
        // this is the core optimization loop

        // overhead of encoding a match: token (1 byte) + offset (2 bytes) +
        // sometimes extra bytes for long matches
        Cost extraCost = 1 + 2;
        Length nextCostIncrease = 18;  // need one more byte for 19+ long
                                       // matches (next increase: 19+255*x)

        // try all match lengths (start with short ones)
        for (Length length = MinMatch; length <= match.length; length++) {
          // token (1 byte) + offset (2 bytes) + extra bytes for long matches
          Cost currentCost = cost[i + length] + extraCost;
          // better choice ?
          if (currentCost <= minCost) {
            // regarding the if-condition:
            // "<"  prefers literals and shorter matches
            // "<=" prefers longer matches
            // they should produce the same number of bytes (because of the
            // same cost) ... but every now and then they don't !
            // that's why: too many consecutive literals require an extra
            // length byte (which we took into consideration a few lines
            // above), but we only looked at literals beyond the current
            // position; if there are many literals in front of the current
            // position, then it may be better to emit a match with the same
            // cost as the literals at the current position
            // => it "breaks" the long chain of literals and removes the
            //    extra length byte
            minCost = currentCost;
            bestLength = length;
            // performance-wise, a long match is usually faster during
            // decoding than multiple short matches; on the other hand,
            // literals are faster than short matches as well (assuming same
            // cost)
          }

          // very long matches need extra bytes for encoding match length
          if (length == nextCostIncrease) {
            extraCost++;
            nextCostIncrease += MaxLengthCode;
          }
        }
      }

      // store lowest cost so far
      cost[i] = minCost;

      // and adjust best match
      matches[i].length = bestLength;

      // reset number of literals if a match was chosen
      if (bestLength != JustLiteral) numLiterals = 0;

      // note: if bestLength is smaller than the previous matches[i].length,
      // then there might be a closer match which could be more
      // cache-friendly (=> faster decoding)
    }
  }

  /// compress everything in input stream (accessed via getBytes) and write
  /// to output stream (via send), improve compression with a predefined
  /// dictionary
  void compress(GET_BYTES getBytes, SEND_BYTES sendBytes,
                const std::vector<unsigned char>& dictionary,
                bool useLegacyFormat, void* userPtr) const {
    // ==================== write header ====================
    if (useLegacyFormat) {
      // magic bytes
      const unsigned char header[] = {0x02, 0x21, 0x4C, 0x18};
      sendBytes(header, sizeof(header), userPtr);
    } else {
      // frame header
      const unsigned char header[] = {
          0x04, 0x22, 0x4D,
          0x18,                 // magic bytes
          1 << 6,               // flags: no checksums, blocks depend on each
                                // other and no dictionary ID
          MaxBlockSizeId << 4,  // max blocksize
          0xDF                  // header checksum (precomputed)
      };
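      // (1 << 6 is the frame format version, binary 01 in the two high bits
      // of the flag byte; every optional feature bit stays clear, which is
      // why a single precomputed checksum byte suffices)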
      sendBytes(header, sizeof(header), userPtr);
    }

    // ==================== declarations ====================
    // change read buffer size as you like
    unsigned char buffer[BufferSize];

    // read the file in chunks/blocks, data will contain only bytes which are
    // relevant for the current block
    std::vector<unsigned char> data;

    // file position corresponding to data[0]
    size_t dataZero = 0;
    // last already read position
    size_t numRead = 0;

    // passthru data ? (but still wrap it in LZ4 format)
    const bool uncompressed = (maxChainLength == 0);

    // last time we saw a hash
    const uint64_t NoLastHash = ~0;  // = -1
    std::vector<uint64_t> lastHash(HashSize, NoLastHash);

    // previous position which starts with the same bytes
    std::vector<Distance> previousHash(
        MaxDistance + 1,
        Distance(EndOfChain));  // long chains based on my simple hash
    std::vector<Distance> previousExact(
        MaxDistance + 1,
        Distance(EndOfChain));  // shorter chains based on exact matching of
                                // the first four bytes

    // these two containers are essential for match finding:
    // 1. I compute a hash of four bytes
    // 2. lastHash holds the location of the most recent block of four bytes
    //    with that same hash
    // 3. due to hash collisions, several groups of four bytes may yield the
    //    same hash
    // 4. so for each location I can look up the previous location of the
    //    same hash in previousHash
    // 5. basically it's a chain of memory locations where potential matches
    //    start
    // 6. I follow this hash chain until I find exactly the same four bytes I
    //    was looking for
    // 7. then I switch to a sparser chain: previousExact
    // 8. it's basically the same idea as previousHash but this time not the
    //    hash but the first four bytes must be identical
    // 9. previousExact will be used by findLongestMatch: it compares all
    //    such strings and figures out which is the longest match

    // And why do I have to do it in such a complicated way ?
    // - well, there are 2^32 combinations of four bytes
    // - so there are 2^32 potential chains
    // - most combinations just don't occur and occupy no space, but I still
    //   have to keep their "entry points" (which are empty/invalid)
    // - that would be at least 16 GBytes RAM (2^32 x 4 bytes)
    // - my hashing algorithm reduces the 2^32 combinations to 2^20 hashes
    //   (see HashBits), that's about 8 MBytes RAM
    // - thus only 2^20 entry points and at most 2^20 hash chains, which is
    //   easily manageable
    // ... in the end it's all about conserving memory !
    // (total memory consumption of smallz4 is about 64 MBytes)

    // first and last offset of a block (nextBlock is end-of-block plus 1)
    uint64_t lastBlock = 0;
    uint64_t nextBlock = 0;
    bool parseDictionary = !dictionary.empty();

    // main loop, processes one block per iteration
    while (true) {
      // ==================== start new block ====================
      // first byte of the currently processed block (std::vector data may
      // contain the last 64k of the previous block, too)
      const unsigned char* dataBlock = NULL;

      // prepend dictionary
      if (parseDictionary) {
        // resize dictionary to 64k (minus 1 because we can only match the
        // last 65535 bytes of the dictionary => MaxDistance)
        if (dictionary.size() < MaxDistance) {
          // dictionary is smaller than 64k, prepend garbage data
          size_t unused = MaxDistance - dictionary.size();
          data.resize(unused, 0);
          data.insert(data.end(), dictionary.begin(), dictionary.end());
        } else
          // copy only the most recent 64k of the dictionary
          data.insert(data.end(),
                      dictionary.begin() + dictionary.size() - MaxDistance,
                      dictionary.end());

        nextBlock = data.size();
        numRead = data.size();
      }

      // read more bytes from input
      size_t maxBlockSize = useLegacyFormat ? MaxBlockSizeLegacy : MaxBlockSize;
      while (numRead - nextBlock < maxBlockSize) {
        // buffer can be significantly smaller than MaxBlockSize, that's the
        // only reason for this while-block
        size_t incoming = getBytes(buffer, BufferSize, userPtr);
        // no more data ?
        if (incoming == 0) break;

        // add bytes to buffer
        numRead += incoming;
        data.insert(data.end(), buffer, buffer + incoming);
      }

      // no more data ? => WE'RE DONE !
      if (nextBlock == numRead) break;

      // determine block borders
      lastBlock = nextBlock;
      nextBlock += maxBlockSize;
      // not beyond end-of-file
      if (nextBlock > numRead) nextBlock = numRead;

      // pointer to first byte of the currently processed block (the
      // std::vector container named data may contain the last 64k of the
      // previous block, too)
      dataBlock = &data[lastBlock - dataZero];

      const uint64_t blockSize = nextBlock - lastBlock;

      // ==================== full match finder ====================

      // greedy mode is much faster but produces larger output
      const bool isGreedy = (maxChainLength <= ShortChainsGreedy);
      // lazy evaluation: if there is a match, then try running match finder
      // on next position, too, but not after that
      const bool isLazy = !isGreedy && (maxChainLength <= ShortChainsLazy);
      // skip match finding on the next x bytes in greedy mode
      Length skipMatches = 0;
      // allow match finding on the next byte but skip afterwards (in lazy
      // mode)
      bool lazyEvaluation = false;

      // the last literals of the previous block skipped matching, so they
      // are missing from the hash chains
      int64_t lookback = int64_t(dataZero);
      if (lookback > BlockEndNoMatch && !parseDictionary)
        lookback = BlockEndNoMatch;
      if (parseDictionary) lookback = int64_t(dictionary.size());
      // so let's go back a few bytes
      lookback = -lookback;
      // ... but not in legacy mode
      if (useLegacyFormat || uncompressed) lookback = 0;

      std::vector<Match> matches(uncompressed ? 0 : blockSize);
      // find longest matches for each position (skip if level=0 which means
      // "uncompressed")
      int64_t i;
      for (i = lookback;
           i + BlockEndNoMatch <= int64_t(blockSize) && !uncompressed; i++) {
        // detect self-matching
        if (i > 0 && dataBlock[i] == dataBlock[i - 1]) {
          Match prevMatch = matches[i - 1];
          // predecessor had the same match ?
          if (prevMatch.distance == 1 &&
              prevMatch.length > MaxSameLetter)  // TODO: handle very long
                                                 // self-referencing matches
          {
            // just copy predecessor without further (expensive)
            // optimizations
            matches[i].distance = 1;
            matches[i].length = prevMatch.length - 1;
            continue;
          }
        }

        // read next four bytes
        const uint32_t four = *(uint32_t*)(dataBlock + i);
        // convert to a shorter hash
        const uint32_t hash = getHash32(four);

        // get most recent position of this hash
        uint64_t lastHashMatch = lastHash[hash];
        // and store current position
        lastHash[hash] = i + lastBlock;

        // remember: i could be negative, too
        Distance prevIndex =
            (i + MaxDistance + 1) &
            MaxDistance;  // actually the same as i & MaxDistance

        // no predecessor / no hash chain available ?
        if (lastHashMatch == NoLastHash) {
          previousHash[prevIndex] = EndOfChain;
          previousExact[prevIndex] = EndOfChain;
          continue;
        }

        // most recent hash match too far away ?
        uint64_t distance = lastHash[hash] - lastHashMatch;
        if (distance > MaxDistance) {
          previousHash[prevIndex] = EndOfChain;
          previousExact[prevIndex] = EndOfChain;
          continue;
        }

        // build hash chain, i.e. store distance to last pseudo-match
        previousHash[prevIndex] = (Distance)distance;

        // skip pseudo-matches (hash collisions) and build a second chain
        // where the first four bytes must match exactly
        uint32_t currentFour;
        // check the hash chain
        while (true) {
          // read four bytes
          currentFour =
              *(uint32_t*)(&data[lastHashMatch -
                                 dataZero]);  // match may be found in the
                                              // previous block, too
          // match chain found, first 4 bytes are identical
          if (currentFour == four) break;

          // prevent accidentally hopping onto an old, wrong hash chain
          if (hash != getHash32(currentFour)) break;

          // try next pseudo-match
          Distance next = previousHash[lastHashMatch & MaxDistance];
          // end of the hash chain ?
          if (next == EndOfChain) break;

          // too far away ?
          distance += next;
          if (distance > MaxDistance) break;

          // take another step along the hash chain ...
          lastHashMatch -= next;
          // closest match is out of range ?
          if (lastHashMatch < dataZero) break;
        }

        // search aborted / failed ?
        if (four != currentFour) {
          // no matches for the first four bytes
          previousExact[prevIndex] = EndOfChain;
          continue;
        }

        // store distance to previous match
        previousExact[prevIndex] = (Distance)distance;

        // no matching if crossing block boundary, just update hash tables
        if (i < 0) continue;

        // skip match finding if in greedy mode
        if (skipMatches > 0) {
          skipMatches--;
          if (!lazyEvaluation) continue;
          lazyEvaluation = false;
        }

        // and after all that preparation ... finally look for the longest
        // match
        matches[i] = findLongestMatch(data.data(), i + lastBlock, dataZero,
                                      nextBlock - BlockEndLiterals,
                                      previousExact.data());

        // no match finding needed for the next few bytes in greedy/lazy mode
        if ((isLazy || isGreedy) && matches[i].length != JustLiteral) {
          lazyEvaluation = (skipMatches == 0);
          skipMatches = matches[i].length;
        }
      }
      // last bytes are always literals
      while (i < int(matches.size())) matches[i++].length = JustLiteral;

      // dictionary is valid only for the first block
      parseDictionary = false;

      // ==================== estimate costs (number of compressed bytes)
      // ====================

      // not needed in greedy mode and/or very short blocks
      if (matches.size() > BlockEndNoMatch &&
          maxChainLength > ShortChainsGreedy)
        estimateCosts(matches);

      // ==================== select best matches ====================

      std::vector<unsigned char> compressed =
          selectBestMatches(matches, &data[lastBlock - dataZero]);

      // ==================== output ====================

      // did compression do harm ?
      bool useCompression = compressed.size() < blockSize && !uncompressed;
      // legacy format is always compressed
      useCompression |= useLegacyFormat;

      // block size
      uint32_t numBytes =
          uint32_t(useCompression ? compressed.size() : blockSize);
      uint32_t numBytesTagged = numBytes | (useCompression ? 0 : 0x80000000);
      unsigned char num1 = numBytesTagged & 0xFF;
      sendBytes(&num1, 1, userPtr);
      unsigned char num2 = (numBytesTagged >> 8) & 0xFF;
      sendBytes(&num2, 1, userPtr);
      unsigned char num3 = (numBytesTagged >> 16) & 0xFF;
      sendBytes(&num3, 1, userPtr);
      unsigned char num4 = (numBytesTagged >> 24) & 0xFF;
      sendBytes(&num4, 1, userPtr);
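      // (the block size goes out in little-endian order; e.g. a 1000-byte
      // compressed block emits E8 03 00 00, while a stored/uncompressed
      // block additionally sets the top bit via 0x80000000)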

      if (useCompression)
        sendBytes(compressed.data(), numBytes, userPtr);
      else  // uncompressed ? => copy input data
        sendBytes(&data[lastBlock - dataZero], numBytes, userPtr);

      // legacy format: no matching across blocks
      if (useLegacyFormat) {
        dataZero += data.size();
        data.clear();

        // clear hash tables
        for (size_t i = 0; i < previousHash.size(); i++)
          previousHash[i] = EndOfChain;
        for (size_t i = 0; i < previousExact.size(); i++)
          previousExact[i] = EndOfChain;
        for (size_t i = 0; i < lastHash.size(); i++) lastHash[i] = NoLastHash;
      } else {
        // remove already processed data except for the last 64kb which could
        // be used for intra-block matches
        if (data.size() > MaxDistance) {
          size_t remove = data.size() - MaxDistance;
          dataZero += remove;
          data.erase(data.begin(), data.begin() + remove);
        }
      }
    }

    // add an empty block
    if (!useLegacyFormat) {
      static const uint32_t zero = 0;
      sendBytes(&zero, 4, userPtr);
    }
  }
};

#endif /* COSMOPOLITAN_THIRD_PARTY_SMALLZ4_SMALLZ4_H_ */