Mirror of https://github.com/jart/cosmopolitan.git, synced 2025-01-31 11:37:35 +00:00
868af3f950

You can now use the hardest fastest and most dangerous language there is with
Cosmopolitan. So far about 75% of LLVM libcxx has been added. A few breaking
changes needed to be made to help this go smoothly.

- Rename nothrow to dontthrow
- Rename nodiscard to dontdiscard
- Add some libm functions, e.g. lgamma, nan, etc.
- Change intmax_t from int128 to int64 like everything else
- Introduce %jjd formatting directive for int128_t
- Introduce strtoi128(), strtou128(), etc.
- Rename bsrmax() to bsr128()

Some of the templates that should be working currently are std::vector,
std::string, std::map, std::set, std::deque, etc.
807 lines
30 KiB
C++
#ifndef COSMOPOLITAN_THIRD_PARTY_SMALLZ4_SMALLZ4_H_
#define COSMOPOLITAN_THIRD_PARTY_SMALLZ4_SMALLZ4_H_
#include "third_party/libcxx/vector"

/**
 * LZ4 compression with optimal parsing
 *
 * See smallz4.cc for a basic I/O interface; you can easily replace it
 * with an in-memory version, and then all you have to do is:
 *
 *     smallz4::lz4(GET_BYTES, SEND_BYTES);
 *
 * For more advanced stuff, you can call lz4 with up to four parameters
 * (incl. max chain length and a dictionary)
 */
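
/*
 * A minimal in-memory usage sketch (an added illustration, not part of the
 * original smallz4 sources): the struct and callback names below are
 * invented, only smallz4::lz4(), GET_BYTES and SEND_BYTES come from this
 * header, and 65535 just spells out the default "unlimited" chain length.
 *
 *   struct MemoryStreams {
 *     const unsigned char* src;         // remaining input
 *     size_t srcLeft;                   // bytes left to read
 *     std::vector<unsigned char>* dst;  // receives the compressed frame
 *   };
 *
 *   static size_t readFromMemory(void* data, size_t numBytes, void* userPtr) {
 *     MemoryStreams* s = (MemoryStreams*)userPtr;
 *     if (numBytes > s->srcLeft) numBytes = s->srcLeft;
 *     unsigned char* out = (unsigned char*)data;
 *     for (size_t k = 0; k < numBytes; k++) out[k] = s->src[k];
 *     s->src += numBytes;
 *     s->srcLeft -= numBytes;
 *     return numBytes;  // returning 0 signals end of input
 *   }
 *
 *   static void writeToVector(const void* data, size_t numBytes, void* userPtr) {
 *     MemoryStreams* s = (MemoryStreams*)userPtr;
 *     const unsigned char* p = (const unsigned char*)data;
 *     s->dst->insert(s->dst->end(), p, p + numBytes);
 *   }
 *
 *   // compress `input` (a std::vector<unsigned char>) into `output`
 *   std::vector<unsigned char> output;
 *   MemoryStreams streams = {input.data(), input.size(), &output};
 *   smallz4::lz4(readFromMemory, writeToVector, 65535, false, &streams);
 */
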
class smallz4 {
 public:
  // read several bytes, see getBytesFromIn() in smallz4.cpp for a basic
  // implementation
  typedef size_t (*GET_BYTES)(void* data, size_t numBytes, void* userPtr);
  // write several bytes, see sendBytesToOut() in smallz4.cpp for a basic
  // implementation
  typedef void (*SEND_BYTES)(const void* data, size_t numBytes, void* userPtr);

  /// compress everything in input stream (accessed via getBytes) and write to
  /// output stream (via sendBytes)
  static void lz4(GET_BYTES getBytes, SEND_BYTES sendBytes,
                  unsigned short maxChainLength = MaxChainLength,
                  bool useLegacyFormat = false, void* userPtr = NULL) {
    lz4(getBytes, sendBytes, maxChainLength, std::vector<unsigned char>(),
        useLegacyFormat, userPtr);
  }

  /// compress everything in input stream (accessed via getBytes) and write to
  /// output stream (via sendBytes)
  static void lz4(
      GET_BYTES getBytes, SEND_BYTES sendBytes, unsigned short maxChainLength,
      const std::vector<unsigned char>& dictionary,  // predefined dictionary
      bool useLegacyFormat =
          false,  // old format is 7 bytes smaller if input < 8 MB
      void* userPtr = NULL) {
    smallz4 obj(maxChainLength);
    obj.compress(getBytes, sendBytes, dictionary, useLegacyFormat, userPtr);
  }

  /// version string
  static const char* const getVersion() {
    return "1.5";
  }

  // compression level thresholds, made public because I display them in the
  // help screen ...
  enum {
    /// greedy mode for short chains (compression level <= 3) instead of
    /// optimal parsing / lazy evaluation
    ShortChainsGreedy = 3,
    /// lazy evaluation for medium-sized chains (compression level > 3 and <= 6)
    ShortChainsLazy = 6
  };

  // ----- END OF PUBLIC INTERFACE -----
 private:
  // ----- constants and types -----

  /// a block can be up to 4 MB, so uint32_t would suffice but uint64_t is
  /// quite a bit faster on my x64 machine
  typedef uint64_t Length;
  /// matches must start within the most recent 64k
  typedef uint16_t Distance;

  enum {
    /// each match's length must be >= 4
    MinMatch = 4,
    /// a literal needs one byte
    JustLiteral = 1,
    /// last match must not be closer than 12 bytes to the end
    BlockEndNoMatch = 12,
    /// last 5 bytes must be literals, no matching allowed
    BlockEndLiterals = 5,

    /// match finder's hash table size (2^HashBits entries, HashBits must be
    /// less than 32)
    HashBits = 20,
    HashSize = 1 << HashBits,

    /// input buffer size, can be any number but zero ;-)
    BufferSize = 1024,

    /// maximum match distance, must be power of 2 minus 1
    MaxDistance = 65535,
    /// marker for "no match"
    EndOfChain = 0,
    /// stop match finding after MaxChainLength steps (default is unlimited =>
    /// optimal parsing)
    MaxChainLength = MaxDistance,

    /// significantly speeds up parsing if the same byte is repeated a lot, may
    /// cause sub-optimal compression
    MaxSameLetter = 19 + 255 * 256,  // was: 19 + 255,

    /// maximum block size as defined in the LZ4 spec:
    /// { 0,0,0,0,64*1024,256*1024,1024*1024,4*1024*1024 };
    /// I only work with the biggest maximum block size (7)
    // note: xxhash header checksum is precalculated only for 7, too
    MaxBlockSizeId = 7,
    MaxBlockSize = 4 * 1024 * 1024,

    /// legacy format has a fixed block size of 8 MB
    MaxBlockSizeLegacy = 8 * 1024 * 1024,

    /// number of literals and match length are encoded in several bytes,
    /// max. 255 per byte
    MaxLengthCode = 255
  };

  // ----- one and only variable ... -----

  /// how many matches are checked in findLongestMatch; lower values yield
  /// faster encoding at the cost of a worse compression ratio
  unsigned short maxChainLength;

  // ----- code -----

  /// match
  struct Match {
    /// length of match
    Length length;
    /// start of match
    Distance distance;
  };

  /// create new compressor (only invoked by lz4)
  explicit smallz4(unsigned short newMaxChainLength = MaxChainLength)
      : maxChainLength(newMaxChainLength)  // => no limit, but can be changed by
                                           // setMaxChainLength
  {
  }

  /// return true if the four bytes at *a and *b match
  inline static bool match4(const void* const a, const void* const b) {
    return *(const uint32_t*)a == *(const uint32_t*)b;
  }

  /// simple hash function, input: 32 bits, output: HashBits bits
  /// (by default: 20)
  inline static uint32_t getHash32(uint32_t fourBytes) {
    // taken from https://en.wikipedia.org/wiki/Linear_congruential_generator
    const uint32_t HashMultiplier = 48271;
    return ((fourBytes * HashMultiplier) >> (32 - HashBits)) & (HashSize - 1);
  }

  /// find longest match of data[pos] between data[begin] and data[end],
  /// use match chain
  Match findLongestMatch(const unsigned char* const data, uint64_t pos,
                         uint64_t begin, uint64_t end,
                         const Distance* const chain) const {
    Match result;
    result.length = JustLiteral;  // assume a literal => one byte

    // compression level: look only at the first n entries of the match chain
    unsigned short stepsLeft = maxChainLength;
    // findLongestMatch() shouldn't be called when maxChainLength = 0
    // (uncompressed)

    // pointer to position that is currently analyzed (which we try to find a
    // great match for)
    const unsigned char* const current = data + pos - begin;
    // don't match beyond this point
    const unsigned char* const stop = current + end - pos;

    // get distance to previous match, abort if 0 => not existing
    Distance distance = chain[pos & MaxDistance];
    int64_t totalDistance = 0;
    while (distance != EndOfChain) {
      // chain goes too far back ?
      totalDistance += distance;
      if (totalDistance > MaxDistance) break;  // can't match beyond 64k

      // prepare next position
      distance = chain[(pos - totalDistance) & MaxDistance];

      // let's introduce a new pointer atLeast that points to the first "new"
      // byte of a potential longer match
      const unsigned char* const atLeast = current + result.length + 1;
      // impossible to find a longer match because not enough bytes left ?
      if (atLeast > stop) break;

      // the idea is to split the comparison algorithm into 2 phases
      // (1) scan backward from atLeast to current, abort if mismatch
      // (2) scan forward until a mismatch is found and store length/distance
      //     of this new best match
      //
      //   current                  atLeast
      //      |                        |
      //      -<<<<<<<< phase 1 <<<<<<<<
      //                               >>> phase 2 >>>
      //
      // main reason for phase 1:
      // - both byte sequences start with the same bytes, quite likely they
      //   are very similar
      // - there is a good chance that if they differ, then their last bytes
      //   differ
      // => checking the last bytes first increases the probability that a
      //    mismatch is detected as early as possible

      // compare 4 bytes at once
      const Length CheckAtOnce = 4;

      // all bytes between current and atLeast shall be identical
      const unsigned char* phase1 =
          atLeast - CheckAtOnce;  // minus 4 because match4 checks 4 bytes
      while (phase1 > current && match4(phase1, phase1 - totalDistance))
        phase1 -= CheckAtOnce;
      // note: - the first four bytes always match
      //       - in the last iteration, phase1 points either at current + 1 or
      //         current + 2 or current + 3
      //       - therefore we compare a few bytes twice => but a check to skip
      //         these comparisons is more expensive

      // mismatch ? (the while-loop was aborted)
      if (phase1 > current) continue;

      // we have a new best match, now scan forward
      const unsigned char* phase2 = atLeast;

      // fast loop: check four bytes at once
      while (phase2 + CheckAtOnce <= stop &&
             match4(phase2, phase2 - totalDistance))
        phase2 += CheckAtOnce;
      // slow loop: check the last 1/2/3 bytes
      while (phase2 < stop && *phase2 == *(phase2 - totalDistance)) phase2++;

      // store new best match
      result.distance = Distance(totalDistance);
      result.length = Length(phase2 - current);

      // stop searching on lower compression levels
      if (--stepsLeft == 0) break;
    }

    return result;
  }

  /// create shortest output (a worked byte-level example of the encoding
  /// follows this function)
  /** data points to block's begin; we need it to extract literals **/
  static std::vector<unsigned char> selectBestMatches(
      const std::vector<Match>& matches, const unsigned char* const data) {
    // store encoded data
    std::vector<unsigned char> result;
    result.reserve(matches.size());

    // indices of current run of literals
    size_t literalsFrom = 0;
    size_t numLiterals = 0;

    bool lastToken = false;

    // walk through the whole block
    for (size_t offset = 0;
         offset < matches.size();)  // increment inside of loop
    {
      // get best cost-weighted match
      Match match = matches[offset];

      // if no match, then count literals instead
      if (match.length <= JustLiteral) {
        // first literal ? need to reset pointers of current sequence of
        // literals
        if (numLiterals == 0) literalsFrom = offset;

        // add one more literal to current sequence
        numLiterals++;

        // next match
        offset++;

        // continue unless it's the last literal
        if (offset < matches.size()) continue;

        lastToken = true;
      } else {
        // skip unused matches
        offset += match.length;
      }

      // store match length (4 is implied because it's the minimum match length)
      int matchLength = int(match.length) - MinMatch;

      // last token has zero length
      if (lastToken) matchLength = 0;

      // token consists of match length and number of literals, let's start with
      // match length ...
      unsigned char token =
          (matchLength < 15) ? (unsigned char)matchLength : 15;

      // >= 15 literals ? (extra bytes to store length)
      if (numLiterals < 15) {
        // add number of literals in higher four bits
        token |= numLiterals << 4;
        result.push_back(token);
      } else {
        // set all higher four bits, the following bytes will determine the
        // exact number of literals
        result.push_back(token | 0xF0);

        // 15 is already encoded in token
        int encodeNumLiterals = int(numLiterals) - 15;

        // emit 255 until remainder is below 255
        while (encodeNumLiterals >= MaxLengthCode) {
          result.push_back(MaxLengthCode);
          encodeNumLiterals -= MaxLengthCode;
        }
        // and the last byte (can be zero, too)
        result.push_back((unsigned char)encodeNumLiterals);
      }
      // copy literals
      if (numLiterals > 0) {
        result.insert(result.end(), data + literalsFrom,
                      data + literalsFrom + numLiterals);

        // last token doesn't have a match
        if (lastToken) break;

        // reset
        numLiterals = 0;
      }

      // distance stored in 16 bits / little endian
      result.push_back(match.distance & 0xFF);
      result.push_back(match.distance >> 8);

      // >= 15+4 bytes matched
      if (matchLength >= 15) {
        // 15 is already encoded in token
        matchLength -= 15;
        // emit 255 until remainder is below 255
        while (matchLength >= MaxLengthCode) {
          result.push_back(MaxLengthCode);
          matchLength -= MaxLengthCode;
        }
        // and the last byte (can be zero, too)
        result.push_back((unsigned char)matchLength);
      }
    }

    return result;
  }
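
  // A worked example of the byte layout produced above (the numbers are
  // invented for illustration and are not from the original comments):
  // suppose a run of 20 literals is followed by a match of total length 300
  // at distance 77.  The loop then emits:
  //
  //   0xFF        token: literal nibble = 15, match nibble = 15 (both maxed)
  //   0x05        remaining literal count, 20 - 15 = 5
  //   <20 bytes>  the literal bytes themselves
  //   0x4D 0x00   distance 77 as 16 bits, little endian
  //   0xFF 0x1A   remaining match length, 300 - 4 - 15 = 281 = 255 + 26
  //
  // i.e. any 4-bit field that would exceed 14 is stored as 15 plus extra
  // bytes of at most MaxLengthCode (255) each, which is exactly what the two
  // while-loops above implement.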

  /// walk backwards through all matches and compute the number of compressed
  /// bytes from the current position to the end of the block (a short summary
  /// of the recurrence follows this function)
  /** note: matches are modified (shortened length) if necessary **/
  static void estimateCosts(std::vector<Match>& matches) {
    const size_t blockEnd = matches.size();

    // equals the number of bytes after compression
    typedef uint32_t Cost;
    // minimum cost from this position to the end of the current block
    std::vector<Cost> cost(matches.size(), 0);
    // "cost" represents the number of bytes needed

    // the last bytes must always be literals
    Length numLiterals = BlockEndLiterals;
    // backwards optimal parsing
    for (int64_t i = (int64_t)blockEnd - (1 + BlockEndLiterals); i >= 0;
         i--)  // ignore the last 5 bytes, they are always literals
    {
      // if encoded as a literal
      numLiterals++;
      Length bestLength = JustLiteral;
      // such a literal "costs" 1 byte
      Cost minCost = cost[i + 1] + JustLiteral;

      // an extra length byte is required for every 255 literals
      if (numLiterals >= 15) {
        // same as: if ((numLiterals - 15) % MaxLengthCode == 0)
        // but I try hard to avoid the slow modulo function
        if (numLiterals == 15 || (numLiterals >= 15 + MaxLengthCode &&
                                  (numLiterals - 15) % MaxLengthCode == 0))
          minCost++;
      }

      // let's look at the longest match, almost always more efficient than
      // the plain literals
      Match match = matches[i];

      // very long self-referencing matches can slow down the program A LOT
      if (match.length >= MaxSameLetter && match.distance == 1) {
        // assume that the longest match is always the best match
        // NOTE: this assumption might not be optimal !
        bestLength = match.length;
        minCost =
            cost[i + match.length] + 1 + 2 + 1 + Cost(match.length - 19) / 255;
      } else {
        // this is the core optimization loop

        // overhead of encoding a match: token (1 byte) + offset (2 bytes) +
        // sometimes extra bytes for long matches
        Cost extraCost = 1 + 2;
        Length nextCostIncrease = 18;  // need one more byte for 19+ long
                                       // matches (next increase: 19+255*x)

        // try all match lengths (start with short ones)
        for (Length length = MinMatch; length <= match.length; length++) {
          // token (1 byte) + offset (2 bytes) + extra bytes for long matches
          Cost currentCost = cost[i + length] + extraCost;
          // better choice ?
          if (currentCost <= minCost) {
            // regarding the if-condition:
            // "<" prefers literals and shorter matches
            // "<=" prefers longer matches
            // they should produce the same number of bytes (because of the
            // same cost) ... but every now and then they don't !
            // that's why: too many consecutive literals require an extra
            // length byte (which we took into consideration a few lines
            // above), but we only looked at literals beyond the current
            // position; if there are many literals in front of the current
            // position, then it may be better to emit a match with the same
            // cost as the literals at the current position
            // => it "breaks" the long chain of literals and removes the extra
            //    length byte
            minCost = currentCost;
            bestLength = length;
            // performance-wise, a long match is usually faster during
            // decoding than multiple short matches; on the other hand,
            // literals are faster than short matches as well (assuming the
            // same cost)
          }

          // very long matches need extra bytes for encoding match length
          if (length == nextCostIncrease) {
            extraCost++;
            nextCostIncrease += MaxLengthCode;
          }
        }
      }

      // store lowest cost so far
      cost[i] = minCost;

      // and adjust best match
      matches[i].length = bestLength;

      // reset number of literals if a match was chosen
      if (bestLength != JustLiteral) numLiterals = 0;

      // note: if bestLength is smaller than the previous matches[i].length,
      // then there might be a closer match
      // which could be more cache-friendly (=> faster decoding)
    }
  }
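
  // Informal summary of the recurrence implemented above (my paraphrase, not
  // part of the original comments), ignoring the occasional extra byte for
  // very long literal runs and the shortcut for repeated single bytes:
  //
  //   cost[i] = min( cost[i + 1] + 1,                       // emit a literal
  //                  min over MinMatch <= len <= matches[i].length of
  //                      cost[i + len] + 3 + lengthBytes(len) )  // emit match
  //
  // where 3 covers the token byte plus the 16-bit offset, and lengthBytes(len)
  // is the number of extra length bytes needed once len - MinMatch >= 15.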

  /// compress everything in input stream (accessed via getBytes) and write to
  /// output stream (via sendBytes), improve compression with a predefined
  /// dictionary
  void compress(GET_BYTES getBytes, SEND_BYTES sendBytes,
                const std::vector<unsigned char>& dictionary,
                bool useLegacyFormat, void* userPtr) const {
    // ==================== write header ====================
    if (useLegacyFormat) {
      // magic bytes
      const unsigned char header[] = {0x02, 0x21, 0x4C, 0x18};
      sendBytes(header, sizeof(header), userPtr);
    } else {
      // frame header
      const unsigned char header[] = {
          0x04, 0x22, 0x4D,
          0x18,                 // magic bytes
          1 << 6,               // flags: no checksums, blocks depend on each
                                // other and no dictionary ID
          MaxBlockSizeId << 4,  // max blocksize
          0xDF                  // header checksum (precomputed)
      };
      sendBytes(header, sizeof(header), userPtr);
    }

    // ==================== declarations ====================
    // change read buffer size as you like
    unsigned char buffer[BufferSize];

    // read the file in chunks/blocks, data will contain only bytes which are
    // relevant for the current block
    std::vector<unsigned char> data;

    // file position corresponding to data[0]
    size_t dataZero = 0;
    // last already read position
    size_t numRead = 0;

    // passthru data ? (but still wrap it in LZ4 format)
    const bool uncompressed = (maxChainLength == 0);

    // last time we saw a hash
    const uint64_t NoLastHash = ~0;  // = -1
    std::vector<uint64_t> lastHash(HashSize, NoLastHash);

    // previous position which starts with the same bytes
    std::vector<Distance> previousHash(
        MaxDistance + 1,
        Distance(EndOfChain));  // long chains based on my simple hash
    std::vector<Distance> previousExact(
        MaxDistance + 1,
        Distance(EndOfChain));  // shorter chains based on exact matching of
                                // the first four bytes
    // these two containers are essential for match finding:
    // 1. I compute a hash of four bytes
    // 2. lastHash contains the location of the most recent block of four
    //    bytes with that same hash
    // 3. due to hash collisions, several groups of four bytes may yield the
    //    same hash
    // 4. so for each location I can look up the previous location of the same
    //    hash in previousHash
    // 5. basically it's a chain of memory locations where potential matches
    //    start
    // 6. I follow this hash chain until I find exactly the same four bytes I
    //    was looking for
    // 7. then I switch to a sparser chain: previousExact
    // 8. it's basically the same idea as previousHash, but this time not the
    //    hash but the first four bytes must be identical
    // 9. previousExact will be used by findLongestMatch: it compares all such
    //    strings and figures out which is the longest match

    // And why do I have to do it in such a complicated way ?
    // - well, there are 2^32 combinations of four bytes
    // - so that there are 2^32 potential chains
    // - most combinations just don't occur and occupy no space, but I still
    //   have to keep their "entry point" (which are empty/invalid)
    // - that would be at least 16 GBytes RAM (2^32 x 4 bytes)
    // - my hashing algorithm reduces the 2^32 combinations to 2^20 hashes
    //   (see HashBits), that's about 8 MBytes RAM
    // - thus only 2^20 entry points and at most 2^20 hash chains, which is
    //   easily manageable
    // ... in the end it's all about conserving memory !
    // (total memory consumption of smallz4 is about 64 MBytes)
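
    // A made-up illustration of the two chains (all positions and the shared
    // hash value H below are invented, they are not from the original
    // comments): suppose "abcd" occurs at absolute positions 100, 350 and
    // 900, and some unrelated four bytes at position 700 happen to produce
    // the same hash H.  After position 900 has been processed:
    //   lastHash[H]                  = 900
    //   previousHash entry for 900   = 200  (back to 700, same hash only)
    //   previousHash entry for 700   = 350  (back to 350)
    //   previousExact entry for 900  = 550  (back to 350, same four bytes)
    //   previousExact entry for 350  = 250  (back to 100)
    // findLongestMatch() then only has to walk the shorter previousExact
    // chain.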

    // first and last offset of a block (nextBlock is end-of-block plus 1)
    uint64_t lastBlock = 0;
    uint64_t nextBlock = 0;
    bool parseDictionary = !dictionary.empty();

    // main loop, processes one block per iteration
    while (true) {
      // ==================== start new block ====================
      // first byte of the currently processed block (std::vector data may
      // contain the last 64k of the previous block, too)
      const unsigned char* dataBlock = NULL;

      // prepend dictionary
      if (parseDictionary) {
        // resize dictionary to 64k (minus 1 because we can only match the last
        // 65535 bytes of the dictionary => MaxDistance)
        if (dictionary.size() < MaxDistance) {
          // dictionary is smaller than 64k, prepend garbage data
          size_t unused = MaxDistance - dictionary.size();
          data.resize(unused, 0);
          data.insert(data.end(), dictionary.begin(), dictionary.end());
        } else
          // copy only the most recent 64k of the dictionary
          data.insert(data.end(),
                      dictionary.begin() + dictionary.size() - MaxDistance,
                      dictionary.end());

        nextBlock = data.size();
        numRead = data.size();
      }

      // read more bytes from input
      size_t maxBlockSize = useLegacyFormat ? MaxBlockSizeLegacy : MaxBlockSize;
      while (numRead - nextBlock < maxBlockSize) {
        // buffer can be significantly smaller than MaxBlockSize, that's the
        // only reason for this while-block
        size_t incoming = getBytes(buffer, BufferSize, userPtr);
        // no more data ?
        if (incoming == 0) break;

        // add bytes to buffer
        numRead += incoming;
        data.insert(data.end(), buffer, buffer + incoming);
      }

      // no more data ? => WE'RE DONE !
      if (nextBlock == numRead) break;

      // determine block borders
      lastBlock = nextBlock;
      nextBlock += maxBlockSize;
      // not beyond end-of-file
      if (nextBlock > numRead) nextBlock = numRead;

      // pointer to first byte of the currently processed block (the std::vector
      // container named data may contain the last 64k of the previous block,
      // too)
      dataBlock = &data[lastBlock - dataZero];

      const uint64_t blockSize = nextBlock - lastBlock;

      // ==================== full match finder ====================

      // greedy mode is much faster but produces larger output
      const bool isGreedy = (maxChainLength <= ShortChainsGreedy);
      // lazy evaluation: if there is a match, then try running match finder on
      // next position, too, but not after that
      const bool isLazy = !isGreedy && (maxChainLength <= ShortChainsLazy);
      // skip match finding on the next x bytes in greedy mode
      Length skipMatches = 0;
      // allow match finding on the next byte but skip afterwards (in lazy mode)
      bool lazyEvaluation = false;

      // the last literals of the previous block skipped matching, so they are
      // missing from the hash chains
      int64_t lookback = int64_t(dataZero);
      if (lookback > BlockEndNoMatch && !parseDictionary)
        lookback = BlockEndNoMatch;
      if (parseDictionary) lookback = int64_t(dictionary.size());
      // so let's go back a few bytes
      lookback = -lookback;
      // ... but not in legacy mode
      if (useLegacyFormat || uncompressed) lookback = 0;

      std::vector<Match> matches(uncompressed ? 0 : blockSize);
      // find longest matches for each position (skip if level=0 which means
      // "uncompressed")
      int64_t i;
      for (i = lookback;
           i + BlockEndNoMatch <= int64_t(blockSize) && !uncompressed; i++) {
        // detect self-matching
        if (i > 0 && dataBlock[i] == dataBlock[i - 1]) {
          Match prevMatch = matches[i - 1];
          // predecessor had the same match ?
          if (prevMatch.distance == 1 &&
              prevMatch.length > MaxSameLetter)  // TODO: handle very long
                                                 // self-referencing matches
          {
            // just copy predecessor without further (expensive) optimizations
            matches[i].distance = 1;
            matches[i].length = prevMatch.length - 1;
            continue;
          }
        }

        // read next four bytes
        const uint32_t four = *(uint32_t*)(dataBlock + i);
        // convert to a shorter hash
        const uint32_t hash = getHash32(four);

        // get most recent position of this hash
        uint64_t lastHashMatch = lastHash[hash];
        // and store current position
        lastHash[hash] = i + lastBlock;

        // remember: i could be negative, too
        Distance prevIndex =
            (i + MaxDistance + 1) &
            MaxDistance;  // actually the same as i & MaxDistance

        // no predecessor / no hash chain available ?
        if (lastHashMatch == NoLastHash) {
          previousHash[prevIndex] = EndOfChain;
          previousExact[prevIndex] = EndOfChain;
          continue;
        }

        // most recent hash match too far away ?
        uint64_t distance = lastHash[hash] - lastHashMatch;
        if (distance > MaxDistance) {
          previousHash[prevIndex] = EndOfChain;
          previousExact[prevIndex] = EndOfChain;
          continue;
        }

        // build hash chain, i.e. store distance to last pseudo-match
        previousHash[prevIndex] = (Distance)distance;

        // skip pseudo-matches (hash collisions) and build a second chain where
        // the first four bytes must match exactly
        uint32_t currentFour;
        // check the hash chain
        while (true) {
          // read four bytes
          currentFour =
              *(uint32_t*)(&data[lastHashMatch -
                                 dataZero]);  // match may be found in the
                                              // previous block, too
          // match chain found, first 4 bytes are identical
          if (currentFour == four) break;

          // prevent accidentally hopping onto an old, wrong hash chain
          if (hash != getHash32(currentFour)) break;

          // try next pseudo-match
          Distance next = previousHash[lastHashMatch & MaxDistance];
          // end of the hash chain ?
          if (next == EndOfChain) break;

          // too far away ?
          distance += next;
          if (distance > MaxDistance) break;

          // take another step along the hash chain ...
          lastHashMatch -= next;
          // closest match is out of range ?
          if (lastHashMatch < dataZero) break;
        }

        // search aborted / failed ?
        if (four != currentFour) {
          // no matches for the first four bytes
          previousExact[prevIndex] = EndOfChain;
          continue;
        }

        // store distance to previous match
        previousExact[prevIndex] = (Distance)distance;

        // no matching if crossing block boundary, just update hash tables
        if (i < 0) continue;

        // skip match finding if in greedy mode
        if (skipMatches > 0) {
          skipMatches--;
          if (!lazyEvaluation) continue;
          lazyEvaluation = false;
        }

        // and after all that preparation ... finally look for the longest match
        matches[i] = findLongestMatch(data.data(), i + lastBlock, dataZero,
                                      nextBlock - BlockEndLiterals,
                                      previousExact.data());

        // no match finding needed for the next few bytes in greedy/lazy mode
        if ((isLazy || isGreedy) && matches[i].length != JustLiteral) {
          lazyEvaluation = (skipMatches == 0);
          skipMatches = matches[i].length;
        }
      }
      // last bytes are always literals
      while (i < int(matches.size())) matches[i++].length = JustLiteral;

      // dictionary is only valid for the first block
      parseDictionary = false;

      // ========== estimate costs (number of compressed bytes) ==========

      // not needed in greedy mode and/or very short blocks
      if (matches.size() > BlockEndNoMatch &&
          maxChainLength > ShortChainsGreedy)
        estimateCosts(matches);

      // ==================== select best matches ====================

      std::vector<unsigned char> compressed =
          selectBestMatches(matches, &data[lastBlock - dataZero]);

      // ==================== output ====================

      // did compression do harm ?
      bool useCompression = compressed.size() < blockSize && !uncompressed;
      // legacy format is always compressed
      useCompression |= useLegacyFormat;

      // block size
      uint32_t numBytes =
          uint32_t(useCompression ? compressed.size() : blockSize);
      uint32_t numBytesTagged = numBytes | (useCompression ? 0 : 0x80000000);
      unsigned char num1 = numBytesTagged & 0xFF;
      sendBytes(&num1, 1, userPtr);
      unsigned char num2 = (numBytesTagged >> 8) & 0xFF;
      sendBytes(&num2, 1, userPtr);
      unsigned char num3 = (numBytesTagged >> 16) & 0xFF;
      sendBytes(&num3, 1, userPtr);
      unsigned char num4 = (numBytesTagged >> 24) & 0xFF;
      sendBytes(&num4, 1, userPtr);

      if (useCompression)
        sendBytes(compressed.data(), numBytes, userPtr);
      else  // uncompressed ? => copy input data
        sendBytes(&data[lastBlock - dataZero], numBytes, userPtr);

      // legacy format: no matching across blocks
      if (useLegacyFormat) {
        dataZero += data.size();
        data.clear();

        // clear hash tables
        for (size_t i = 0; i < previousHash.size(); i++)
          previousHash[i] = EndOfChain;
        for (size_t i = 0; i < previousExact.size(); i++)
          previousExact[i] = EndOfChain;
        for (size_t i = 0; i < lastHash.size(); i++) lastHash[i] = NoLastHash;
      } else {
        // remove already processed data except for the last 64kb which could
        // be used for intra-block matches
        if (data.size() > MaxDistance) {
          size_t remove = data.size() - MaxDistance;
          dataZero += remove;
          data.erase(data.begin(), data.begin() + remove);
        }
      }
    }

    // add an empty block as the end mark
    if (!useLegacyFormat) {
      static const uint32_t zero = 0;
      sendBytes(&zero, 4, userPtr);
    }
  }
};
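
// Note (added here, not from the original header): the non-legacy output
// produced by smallz4::lz4() is a standard LZ4 frame, so it can be read back
// by any LZ4 frame decoder, e.g. with the reference command-line tool:
//
//     lz4 -d archive.lz4 archive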

#endif /* COSMOPOLITAN_THIRD_PARTY_SMALLZ4_SMALLZ4_H_ */