mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-02-07 23:13:34 +00:00
f531acc8f9
- Invent openatemp() API - Invent O_UNLINK open flag - Introduce getenv_secure() API - Remove `git pull` from cosmocc - Fix utimes() when path is NULL - Fix mktemp() to never return NULL - Fix utimensat() UTIME_OMIT on XNU - Improve utimensat() code for RHEL5 - Turn `argv[0]` C:/ to /C/ on Windows - Introduce tmpnam() and tmpnam_r() APIs - Fix more const issues with internal APIs - Permit utimes() on WIN32 in O_RDONLY mode - Fix fdopendir() to check fd is a directory - Fix recent crash regression in landlock make - Fix futimens(AT_FDCWD, NULL) to return EBADF - Use workaround so `make -j` doesn't fork bomb - Rename dontdiscard to __wur (just like glibc) - Fix st_size for WIN32 symlinks containing UTF-8 - Introduce stdio ext APIs needed by GNU coreutils - Fix lstat() on WIN32 for symlinks to directories - Move some constants from normalize.inc to limits.h - Fix segv with memchr() and memcmp() overlapping page - Implement POSIX fflush() behavior for reader streams - Implement AT_SYMLINK_NOFOLLOW for utimensat() on WIN32 - Don't change read-only status of existing files on WIN32 - Correctly handle `0x[^[:xdigit:]]` case in strtol() functions
189 lines
6 KiB
C
189 lines
6 KiB
C
// clang-format off
|
|
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */
|
|
|
|
#ifndef ZDICT_STATIC_LINKING_ONLY
|
|
# define ZDICT_STATIC_LINKING_ONLY
|
|
#endif
|
|
|
|
#include "libc/calls/calls.h"
|
|
#include "libc/calls/weirdtypes.h"
|
|
#include "libc/fmt/fmt.h"
|
|
#include "libc/stdio/dprintf.h"
|
|
#include "libc/stdio/stdio.h"
|
|
#include "libc/temp.h"
|
|
#include "third_party/musl/tempnam.h" /* fprintf */
|
|
#include "libc/calls/calls.h"
|
|
#include "libc/calls/termios.h"
|
|
#include "libc/fmt/conv.h"
|
|
#include "libc/limits.h"
|
|
#include "libc/mem/alg.h"
|
|
#include "libc/mem/alloca.h"
|
|
#include "libc/mem/mem.h"
|
|
#include "libc/runtime/runtime.h"
|
|
#include "libc/stdio/dprintf.h"
|
|
#include "libc/stdio/rand.h"
|
|
#include "libc/temp.h"
|
|
#include "libc/str/str.h"
|
|
#include "libc/sysv/consts/exit.h"
|
|
#include "third_party/musl/crypt.h"
|
|
#include "third_party/musl/rand48.h" /* malloc, free, qsort */
|
|
#include "libc/mem/alg.h"
|
|
#include "libc/mem/mem.h"
|
|
#include "libc/str/str.h" /* memset */
|
|
#include "libc/calls/calls.h"
|
|
#include "libc/calls/struct/timespec.h"
|
|
#include "libc/calls/struct/timeval.h"
|
|
#include "libc/calls/weirdtypes.h"
|
|
#include "libc/sysv/consts/clock.h"
|
|
#include "libc/sysv/consts/sched.h"
|
|
#include "libc/sysv/consts/timer.h"
|
|
#include "libc/time/struct/tm.h"
|
|
#include "libc/time/time.h" /* clock */
|
|
#include "third_party/zstd/lib/common/mem.h" /* read */
|
|
#include "third_party/zstd/lib/common/pool.h"
|
|
#include "third_party/zstd/lib/common/threading.h"
|
|
#include "third_party/zstd/lib/common/zstd_internal.h" /* includes zstd.h */
|
|
#include "third_party/zstd/zdict.h"
|
|
|
|
/**
|
|
* COVER_best_t is used for two purposes:
|
|
* 1. Synchronizing threads.
|
|
* 2. Saving the best parameters and dictionary.
|
|
*
|
|
* All of the methods except COVER_best_init() are thread safe if zstd is
|
|
* compiled with multithreaded support.
|
|
*/
|
|
typedef struct COVER_best_s {
|
|
ZSTD_pthread_mutex_t mutex;
|
|
ZSTD_pthread_cond_t cond;
|
|
size_t liveJobs;
|
|
void *dict;
|
|
size_t dictSize;
|
|
ZDICT_cover_params_t parameters;
|
|
size_t compressedSize;
|
|
} COVER_best_t;
|
|
|
|
/**
|
|
* A segment is a range in the source as well as the score of the segment.
|
|
*/
|
|
typedef struct {
|
|
U32 begin;
|
|
U32 end;
|
|
U32 score;
|
|
} COVER_segment_t;
|
|
|
|
/**
|
|
*Number of epochs and size of each epoch.
|
|
*/
|
|
typedef struct {
|
|
U32 num;
|
|
U32 size;
|
|
} COVER_epoch_info_t;
|
|
|
|
/**
|
|
* Struct used for the dictionary selection function.
|
|
*/
|
|
typedef struct COVER_dictSelection {
|
|
BYTE* dictContent;
|
|
size_t dictSize;
|
|
size_t totalCompressedSize;
|
|
} COVER_dictSelection_t;
|
|
|
|
/**
|
|
* Computes the number of epochs and the size of each epoch.
|
|
* We will make sure that each epoch gets at least 10 * k bytes.
|
|
*
|
|
* The COVER algorithms divide the data up into epochs of equal size and
|
|
* select one segment from each epoch.
|
|
*
|
|
* @param maxDictSize The maximum allowed dictionary size.
|
|
* @param nbDmers The number of dmers we are training on.
|
|
* @param k The parameter k (segment size).
|
|
* @param passes The target number of passes over the dmer corpus.
|
|
* More passes means a better dictionary.
|
|
*/
|
|
COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
|
|
U32 k, U32 passes);
|
|
|
|
/**
 * Warns the user when their corpus is too small.
 *
 * @param maxDictSize  Maximum dictionary size, as for COVER_computeEpochs().
 * @param nbDmers      Number of dmers being trained on.
 * @param displayLevel NOTE(review): presumably the verbosity threshold that
 *                     decides whether the warning is printed -- confirm.
 */
void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
|
|
|
|
/**
|
|
* Checks total compressed size of a dictionary
|
|
*/
|
|
size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
|
|
const size_t *samplesSizes, const BYTE *samples,
|
|
size_t *offsets,
|
|
size_t nbTrainSamples, size_t nbSamples,
|
|
BYTE *const dict, size_t dictBufferCapacity);
|
|
|
|
/**
 * Returns the sum of the sample sizes.
 *
 * @param samplesSizes Array of sample sizes.
 * @param nbSamples    NOTE(review): presumably the number of entries of
 *                     `samplesSizes` to add up -- confirm in cover.c.
 * @return             The sum of the sample sizes.
 */
size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples);
|
|
|
|
/**
|
|
* Initialize the `COVER_best_t`.
|
|
*/
|
|
void COVER_best_init(COVER_best_t *best);
|
|
|
|
/**
|
|
* Wait until liveJobs == 0.
|
|
*/
|
|
void COVER_best_wait(COVER_best_t *best);
|
|
|
|
/**
|
|
* Call COVER_best_wait() and then destroy the COVER_best_t.
|
|
*/
|
|
void COVER_best_destroy(COVER_best_t *best);
|
|
|
|
/**
|
|
* Called when a thread is about to be launched.
|
|
* Increments liveJobs.
|
|
*/
|
|
void COVER_best_start(COVER_best_t *best);
|
|
|
|
/**
|
|
* Called when a thread finishes executing, both on error or success.
|
|
* Decrements liveJobs and signals any waiting threads if liveJobs == 0.
|
|
* If this dictionary is the best so far save it and its parameters.
|
|
*/
|
|
void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
|
|
COVER_dictSelection_t selection);
|
|
/**
|
|
* Error function for COVER_selectDict function. Checks if the return
|
|
* value is an error.
|
|
*/
|
|
unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
|
|
|
|
/**
|
|
* Error function for COVER_selectDict function. Returns a struct where
|
|
* return.totalCompressedSize is a ZSTD error.
|
|
*/
|
|
COVER_dictSelection_t COVER_dictSelectionError(size_t error);
|
|
|
|
/**
|
|
* Always call after selectDict is called to free up used memory from
|
|
* newly created dictionary.
|
|
*/
|
|
void COVER_dictSelectionFree(COVER_dictSelection_t selection);
|
|
|
|
/**
|
|
* Called to finalize the dictionary and select one based on whether or not
|
|
* the shrink-dict flag was enabled. If enabled the dictionary used is the
|
|
* smallest dictionary within a specified regression of the compressed size
|
|
* from the largest dictionary.
|
|
*/
|
|
COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
|
|
size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
|
|
size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
|