cosmopolitan/third_party/smallz4/smallz4cat.c

359 lines
13 KiB
C
Raw Normal View History

/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8 -*-│
vi: set et ft=c ts=8 tw=8 fenc=utf-8 :vi
smallz4cat
Copyright (c) 2016-2019 Stephan Brumme. All rights reserved.
See https://create.stephan-brumme.com/smallz4/ │
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/calls/calls.h"
#include "libc/mem/mem.h"
#include "libc/runtime/gc.internal.h"
#include "libc/runtime/runtime.h"
#include "libc/stdio/stdio.h"
#include "libc/str/str.h"
/**
* @fileoverview shorter, more readable, albeit slower re-implementation
* of lz4cat ( https://github.com/lz4/lz4 )
*
* Limitations:
*
* - Skippable frames are not implemented (and most likely never will
* be); frames with the legacy signature are decoded
*
* - Checksums are not verified (see https://create.stephan-brumme.com/xxhash/
* for a simple implementation)
*
* Replace getByteFromIn() and sendBytesToOut() by your own code if you
* need in-memory LZ4 decompression. Corrupted data causes a call to
* unlz4error().
*/
// size in bytes of the circular buffer holding the most recently decoded
// data; match distances are read from two bytes, so a match never reaches
// back further than this
#define HISTORY_SIZE 65536 // don't change
// number of bytes getByteFromIn() requests from the input per fread()
#define READ_BUFFER_SIZE 1024 // change at will
/// print "ERROR: <msg>" followed by a newline to stderr, then terminate the
/// process with exit code 1 (never returns)
static void unlz4error(const char* msg) {
  fprintf(stderr, "ERROR: %s\n", msg);
  exit(1);
}
/// callback that returns the next byte of the compressed input; its argument
/// is the user pointer handed to unlz4_userPtr()
typedef unsigned char (*GET_BYTE)(void*);
/// callback that receives decompressed data: pointer to the bytes, number of
/// bytes, and the user pointer handed to unlz4_userPtr()
typedef void (*SEND_BYTES)(const unsigned char*, unsigned int, void*);
/// state used by getByteFromIn() and sendBytesToOut(); main() passes its
/// address as the user pointer
struct UserPtr {
// file handles: compressed input and decompressed output
FILE* in;
FILE* out;
// bytes most recently read from the input file
unsigned char readBuffer[READ_BUFFER_SIZE];
// index of the next unread byte in readBuffer
unsigned int pos;
// number of bytes the last fread() stored in readBuffer
unsigned int available;
};
/// return the next byte of the input file
///
/// Bytes are served from the read buffer inside the struct UserPtr that
/// userPtr points to; once the buffer is used up it is refilled with up to
/// READ_BUFFER_SIZE bytes. If the refill yields nothing, unlz4error() is
/// called.
static unsigned char getByteFromIn(void* userPtr) {
  struct UserPtr* state = (struct UserPtr*)userPtr;
  // unread bytes left => no need to touch the file
  if (state->pos != state->available) return state->readBuffer[state->pos++];
  // buffer exhausted => start over at the front and refill
  state->pos = 0;
  state->available =
      fread(state->readBuffer, 1, READ_BUFFER_SIZE, state->in);
  if (!state->available) unlz4error("out of data");
  return state->readBuffer[state->pos++];
}
/// write a block of decompressed bytes to the output file
///
/// @param data      bytes to write; NULL is accepted and writes nothing
/// @param numBytes  number of bytes in data; 0 is accepted and writes nothing
/// @param userPtr   points to the struct UserPtr holding the output handle
///
/// Calls unlz4error() if the output file does not accept all bytes.
static void sendBytesToOut(const unsigned char* data, unsigned int numBytes,
                           void* userPtr) {
  /// cast user-specific data
  struct UserPtr* user = (struct UserPtr*)userPtr;
  if (data == NULL || numBytes == 0) return;
  // fwrite() returning less than requested means the write failed; report
  // it instead of silently producing truncated output
  if (fwrite(data, 1, numBytes, user->out) != numBytes)
    unlz4error("cannot write to output");
}
/// decompress everything in input stream (accessed via getByte) and write to
/// output stream (via sendBytes)
///
/// @param getByte     returns the next byte of the compressed stream
/// @param sendBytes   receives the decompressed data, at most HISTORY_SIZE
///                    bytes per call
/// @param dictionary  filename of a dictionary whose last 64k are preloaded
///                    into the history, or NULL for none
/// @param userPtr     passed unchanged to getByte and sendBytes
///
/// An unknown signature, an unsupported format version, a match offset of
/// zero, an unreadable dictionary and a failed allocation are all reported
/// via unlz4error(). Checksums are skipped, not verified.
void unlz4_userPtr(GET_BYTE getByte, SEND_BYTES sendBytes,
                   const char* dictionary, void* userPtr) {
  // signature (little endian); every byte is widened to unsigned int before
  // shifting so that no bit is ever shifted into the sign bit of an int
  unsigned int signature = getByte(userPtr);
  signature |= (unsigned int)getByte(userPtr) << 8;
  signature |= (unsigned int)getByte(userPtr) << 16;
  signature |= (unsigned int)getByte(userPtr) << 24;
  unsigned char isModern = (signature == 0x184D2204);
  unsigned char isLegacy = (signature == 0x184C2102);
  if (!isModern && !isLegacy) unlz4error("invalid signature");
  unsigned char hasBlockChecksum = false;
  unsigned char hasContentSize = false;
  unsigned char hasContentChecksum = false;
  unsigned char hasDictionaryID = false;
  if (isModern) {
    // flags
    unsigned char flags = getByte(userPtr);
    hasBlockChecksum = flags & 16;
    hasContentSize = flags & 8;
    hasContentChecksum = flags & 4;
    hasDictionaryID = flags & 1;
    // only version 1 file format
    unsigned char version = flags >> 6;
    if (version != 1) unlz4error("only LZ4 file format version 1 supported");
    // ignore blocksize
    unsigned int numIgnore = 1;
    // ignore, skip 8 bytes
    if (hasContentSize) numIgnore += 8;
    // ignore, skip 4 bytes
    if (hasDictionaryID) numIgnore += 4;
    // ignore header checksum (xxhash32 of everything up this point & 0xFF)
    numIgnore++;
    // skip all those ignored bytes
    while (numIgnore--) getByte(userPtr);
  }
  // contains the latest decoded data; zero-filled so that a corrupted stream
  // whose matches refer to never-written history cannot copy stale heap
  // contents to the output
  unsigned char* history = gc(calloc(1, HISTORY_SIZE));
  if (!history) unlz4error("out of memory");
  // next free position in history[], always < HISTORY_SIZE
  unsigned int pos = 0;
  // dictionary compression is a recently introduced feature, just move its
  // contents to the buffer
  if (dictionary != NULL) {
    // open dictionary
    FILE* dict = fopen(dictionary, "rb");
    if (!dict) unlz4error("cannot open dictionary");
    // get dictionary's filesize
    if (fseek(dict, 0, SEEK_END) != 0) unlz4error("cannot read dictionary");
    long dictSize = ftell(dict);
    // ftell() reports failure as -1, which must not be used as a size
    if (dictSize < 0) unlz4error("cannot read dictionary");
    // only the last 64k are relevant
    long relevant = dictSize < HISTORY_SIZE ? 0 : dictSize - HISTORY_SIZE;
    if (fseek(dict, relevant, SEEK_SET) != 0)
      unlz4error("cannot read dictionary");
    if (dictSize > HISTORY_SIZE) dictSize = HISTORY_SIZE;
    // read it and store it at the end of the buffer
    if (fread(history + HISTORY_SIZE - dictSize, 1, dictSize, dict) !=
        (size_t)dictSize)
      unlz4error("cannot read dictionary");
    fclose(dict);
  }
  // parse all blocks until blockSize == 0
  while (1) {
    // block size
    unsigned int blockSize = getByte(userPtr);
    blockSize |= (unsigned int)getByte(userPtr) << 8;
    blockSize |= (unsigned int)getByte(userPtr) << 16;
    blockSize |= (unsigned int)getByte(userPtr) << 24;
    // highest bit set ?
    unsigned char isCompressed = isLegacy || (blockSize & 0x80000000) == 0;
    if (isModern) blockSize &= 0x7FFFFFFF;
    // stop after last block
    if (blockSize == 0) break;
    if (isCompressed) {
      // decompress block
      unsigned int blockOffset = 0;
      unsigned int numWritten = 0;
      while (blockOffset < blockSize) {
        // get a token
        unsigned char token = getByte(userPtr);
        blockOffset++;
        // determine number of literals
        unsigned int numLiterals = token >> 4;
        if (numLiterals == 15) {
          // number of literals length encoded in more than 1 byte
          unsigned char current;
          do {
            current = getByte(userPtr);
            numLiterals += current;
            blockOffset++;
          } while (current == 255);
        }
        blockOffset += numLiterals;
        // copy all those literals
        // (compare against the remaining space instead of adding to pos: the
        // length comes from the input and a sum could wrap around, which
        // would select the unchecked fast loop for a huge length)
        if (numLiterals < HISTORY_SIZE - pos) {
          // fast loop
          while (numLiterals-- > 0) history[pos++] = getByte(userPtr);
        } else {
          // slow loop
          while (numLiterals-- > 0) {
            history[pos++] = getByte(userPtr);
            // flush output buffer
            if (pos == HISTORY_SIZE) {
              sendBytes(history, HISTORY_SIZE, userPtr);
              numWritten += HISTORY_SIZE;
              pos = 0;
            }
          }
        }
        // last token has only literals
        if (blockOffset == blockSize) break;
        // match distance is encoded in two bytes (little endian)
        unsigned int delta = getByte(userPtr);
        delta |= (unsigned int)getByte(userPtr) << 8;
        // zero isn't allowed
        if (delta == 0) unlz4error("invalid offset");
        blockOffset += 2;
        // match length (always >= 4, therefore length is stored minus 4)
        unsigned int matchLength = 4 + (token & 0x0F);
        if (matchLength == 4 + 0x0F) {
          unsigned char current;
          do  // match length encoded in more than 1 byte
          {
            current = getByte(userPtr);
            matchLength += current;
            blockOffset++;
          } while (current == 255);
        }
        // copy match; referencePos is always < HISTORY_SIZE
        unsigned int referencePos =
            (pos >= delta) ? (pos - delta) : (HISTORY_SIZE + pos - delta);
        // start and end within the current 64k block ?
        // (again written without additions that could wrap around)
        if (matchLength < HISTORY_SIZE - pos &&
            matchLength < HISTORY_SIZE - referencePos) {
          // read/write continuous block (no wrap-around at the end of
          // history[]) fast copy
          if (pos >= referencePos + matchLength ||
              referencePos >= pos + matchLength) {
            // non-overlapping
            memcpy(history + pos, history + referencePos, matchLength);
            pos += matchLength;
          } else {
            // overlapping, slower byte-wise copy
            while (matchLength-- > 0) history[pos++] = history[referencePos++];
          }
        } else {
          // either read or write wraps around at the end of history[]
          while (matchLength-- > 0) {
            // copy single byte
            history[pos++] = history[referencePos++];
            // cannot write anymore ? => wrap around
            if (pos == HISTORY_SIZE) {
              // flush output buffer
              sendBytes(history, HISTORY_SIZE, userPtr);
              numWritten += HISTORY_SIZE;
              pos = 0;
            }
            // wrap-around of read location
            referencePos %= HISTORY_SIZE;
          }
        }
      }
      // all legacy blocks must be completely filled - except for the last one
      if (isLegacy && numWritten + pos < 8 * 1024 * 1024) break;
    } else {
      // copy uncompressed data and add to history, too (if next block is
      // compressed and some matches refer to this block)
      while (blockSize-- > 0) {
        // copy a byte ...
        history[pos++] = getByte(userPtr);
        // ... until buffer is full => send to output
        if (pos == HISTORY_SIZE) {
          sendBytes(history, HISTORY_SIZE, userPtr);
          pos = 0;
        }
      }
    }
    if (hasBlockChecksum) {
      // ignore checksum, skip 4 bytes
      getByte(userPtr);
      getByte(userPtr);
      getByte(userPtr);
      getByte(userPtr);
    }
  }
  if (hasContentChecksum) {
    // ignore checksum, skip 4 bytes
    getByte(userPtr);
    getByte(userPtr);
    getByte(userPtr);
    getByte(userPtr);
  }
  // flush output buffer
  sendBytes(history, pos, userPtr);
}
/// older entry point without a user pointer: forwards to unlz4_userPtr(), so
/// getByte and sendBytes are called with NULL as their last argument and
/// have to locate their streams by other means
void unlz4(GET_BYTE getByte, SEND_BYTES sendBytes, const char* dictionary) {
  void* noUserData = NULL;
  unlz4_userPtr(getByte, sendBytes, dictionary, noUserData);
}
/// parse command-line and decompress to STDOUT
///
/// Recognized arguments:
///   -D <file>  use <file> as dictionary
///   -          read from STDIN (which is the default anyway)
///   <file>     read compressed data from <file>; at most one is allowed
/// Any other argument starting with '-' is ignored.
///
/// @return 0 on success; every failure ends the process through
///         unlz4error()
int main(int argc, const char* argv[]) {
  // default input/output streams
  struct UserPtr user = {.in = stdin,
                         .out = stdout,
                         .pos = 0,  // initial input buffer is empty
                         .available = 0};
  const char* dictionary = NULL;
  int parameter;
  for (parameter = 1; parameter < argc; parameter++) {
    const char* current = argv[parameter];
    // dictionary
    if (current[0] == '-' && current[1] == 'D') {
      if (parameter + 1 >= argc) unlz4error("no dictionary filename found");
      dictionary = argv[++parameter];
      continue;
    }
    // "-" stands for STDIN, which is the default; testing only the first
    // character also keeps one-letter filenames working and never reads
    // beyond the terminator of an empty argument
    if (current[0] == '-') continue;
    // already have a filename - at most one filename is allowed (except for
    // dictionary) ?
    if (user.in != stdin) unlz4error("can only decompress one file at a time");
    // get handle of the argument being examined (not necessarily argv[1],
    // e.g. when "-D dictionary" comes first)
    user.in = fopen(current, "rb");
    if (!user.in) unlz4error("file not found");
  }
  // and go !
  unlz4_userPtr(getByteFromIn, sendBytesToOut, dictionary, &user);
  // release the input file if we opened one
  if (user.in != stdin) fclose(user.in);
  return 0;
}