Introduce zstd into cosmopolitan (#843)

2025-09-11 03:03:48 +00:00 · 2023-07-01 20:30:38 -07:00 · 2023-07-01 20:30:38 -07:00 · 5f1e129222
commit 5f1e129222
parent c0348330a5
106 changed files with 64014 additions and 0 deletions
--- a/1
+++ b/1
@ -181,6 +181,7 @@ include third_party/maxmind/maxmind.mk
 include net/finger/finger.mk
 include third_party/double-conversion/test/test.mk
 include third_party/lua/lua.mk
+include third_party/zstd/zstd.mk
 include third_party/tr/tr.mk
 include third_party/sed/sed.mk
 include third_party/awk/awk.mk
--- a/third_party/third_party.mk
+++ b/third_party/third_party.mk
@ -39,4 +39,5 @@ o/$(MODE)/third_party:				\
 	o/$(MODE)/third_party/vqsort		\
 	o/$(MODE)/third_party/xed		\
 	o/$(MODE)/third_party/zip		\
+	o/$(MODE)/third_party/zstd		\
 	o/$(MODE)/third_party/zlib
--- a/third_party/zstd/CHANGELOG
+++ b/third_party/zstd/CHANGELOG
@ -0,0 +1,800 @@
+v1.5.5 (Apr 2023)
+fix: fix rare corruption bug affecting the high compression mode, reported by @danlark1 (#3517, @terrelln)
+perf: improve mid-level compression speed (#3529, #3533, #3543, @yoniko and #3552, @terrelln)
+lib: deprecated bufferless block-level API (#3534) by @terrelln
+cli: mmap large dictionaries to save memory, by @daniellerozenblit
+cli: improve speed of --patch-from mode (~+50%) (#3545) by @daniellerozenblit
+cli: improve i/o speed (~+10%) when processing lots of small files (#3479) by @felixhandte
+cli: zstd no longer crashes when requested to write into write-protected directory (#3541) by @felixhandte
+cli: fix decompression into block device using -o, reported by @georgmu (#3583)
+build: fix zstd CLI compiled with lzma support but not zlib support (#3494) by @Hello71
+build: fix cmake does no longer require 3.18 as minimum version (#3510) by @kou
+build: fix MSVC+ClangCL linking issue (#3569) by @tru
+build: fix zstd-dll, version of zstd CLI that links to the dynamic library (#3496) by @yoniko
+build: fix MSVC warnings (#3495) by @embg
+doc: updated zstd specification to clarify corner cases, by @Cyan4973
+doc: document how to create fat binaries for macos (#3568) by @rickmark
+misc: improve seekable format ingestion speed (~+100%) for very small chunk sizes (#3544) by @Cyan4973
+misc: tests/fullbench can benchmark multiple files (#3516) by @dloidolt
+
+v1.5.4 (Feb 2023)
+perf: +20% faster huffman decompression for targets that can't compile x64 assembly (#3449, @terrelln)
+perf: up to +10% faster streaming compression at levels 1-2 (#3114, @embg)
+perf: +4-13% for levels 5-12 by optimizing function generation (#3295, @terrelln)
+pref: +3-11% compression speed for `arm` target (#3199, #3164, #3145, #3141, #3138, @JunHe77 and #3139, #3160, @danlark1)
+perf: +5-30% faster dictionary compression at levels 1-4 (#3086, #3114, #3152, @embg)
+perf: +10-20% cold dict compression speed by prefetching CDict tables (#3177, @embg)
+perf: +1% faster compression by removing a branch in ZSTD_fast_noDict (#3129, @felixhandte)
+perf: Small compression ratio improvements in high compression mode (#2983, #3391, @Cyan4973 and #3285, #3302, @daniellerozenblit)
+perf: small speed improvement by better detecting `STATIC_BMI2` for `clang` (#3080, @TocarIP)
+perf: Improved streaming performance when `ZSTD_c_stableInBuffer` is set (#2974, @Cyan4973)
+cli: Asynchronous I/O for improved cli speed (#2975, #2985, #3021, #3022, @yoniko)
+cli: Change `zstdless` behavior to align with `zless` (#2909, @binhdvo)
+cli: Keep original file if `-c` or `--stdout` is given (#3052, @dirkmueller)
+cli: Keep original files when result is concatenated into a single output with `-o` (#3450, @Cyan4973)
+cli: Preserve Permissions and Ownership of regular files (#3432, @felixhandte)
+cli: Print zlib/lz4/lzma library versions with `-vv` (#3030, @terrelln)
+cli: Print checksum value for single frame files with `-lv`  (#3332, @Cyan4973)
+cli: Print `dictID` when present with `-lv` (#3184, @htnhan)
+cli: when `stderr` is *not* the console, disable status updates, but preserve final summary (#3458, @Cyan4973)
+cli: support `--best` and `--no-name` in `gzip` compatibility mode (#3059, @dirkmueller)
+cli: support for `posix` high resolution timer `clock_gettime()`, for improved benchmark accuracy (#3423, @Cyan4973)
+cli: improved help/usage (`-h`,  `-H`) formatting (#3094, @dirkmueller and #3385, @jonpalmisc)
+cli: Fix better handling of bogus numeric values (#3268, @ctkhanhly)
+cli: Fix input consists of multiple files _and_ `stdin` (#3222, @yoniko)
+cli: Fix tiny files passthrough (#3215, @cgbur)
+cli: Fix for `-r` on empty directory (#3027, @brailovich)
+cli: Fix empty string as argument for `--output-dir-*` (#3220, @embg)
+cli: Fix decompression memory usage reported by `-vv --long` (#3042, @u1f35c, and #3232, @zengyijing)
+cli: Fix infinite loop when empty input is passed to trainer (#3081, @terrelln)
+cli: Fix `--adapt` doesn't work when `--no-progress` is also set (#3354, @terrelln)
+api: Support for Block-Level Sequence Producer (#3333, @embg)
+api: Support for in-place decompression (#3432, @terrelln)
+api: New  `ZSTD_CCtx_setCParams()`  function, set all parameters defined in a  `ZSTD_compressionParameters`  structure (#3403, @Cyan4973)
+api: Streaming decompression detects incorrect header ID sooner (#3175, @Cyan4973)
+api: Window size resizing optimization for edge case (#3345, @daniellerozenblit)
+api: More accurate error codes for busy-loop scenarios (#3413, #3455, @Cyan4973)
+api: Fix limit overflow in `compressBound` and `decompressBound` (#3362, #3373, Cyan4973) reported by @nigeltao
+api: Deprecate several advanced experimental functions: streaming (#3408, @embg), copy (#3196, @mileshu)
+bug: Fix corruption that rarely occurs in 32-bit mode with wlog=25 (#3361, @terrelln)
+bug: Fix for block-splitter (#3033, @Cyan4973)
+bug: Fixes for Sequence Compression API (#3023, #3040, @Cyan4973)
+bug: Fix leaking thread handles on Windows (#3147, @animalize)
+bug: Fix timing issues with cmake/meson builds (#3166, #3167, #3170, @Cyan4973)
+build: Allow user to select legacy level for cmake (#3050, @shadchin)
+build: Enable legacy support by default in cmake (#3079, @niamster)
+build: Meson build script improvements (#3039, #3120, #3122, #3327, #3357, @eli-schwartz and #3276, @neheb)
+build: Add aarch64 to supported architectures for zstd_trace (#3054, @ooosssososos)
+build: support AIX architecture (#3219, @qiongsiwu)
+build: Fix `ZSTD_LIB_MINIFY` build macro, which now reduces static library size by half (#3366, @terrelln)
+build: Fix Windows issues with Multithreading translation layer (#3364, #3380, @yoniko) and ARM64 target (#3320, @cwoffenden)
+build: Fix `cmake` script (#3382, #3392, @terrelln and #3252 @Tachi107 and #3167 @Cyan4973)
+doc: Updated man page, providing more details for `--train` mode (#3112, @Cyan4973)
+doc: Add decompressor errata document (#3092, @terrelln)
+misc: Enable Intel CET (#2992, #2994, @hjl-tools)
+misc: Fix `contrib/` seekable format (#3058, @yhoogstrate and #3346, @daniellerozenblit)
+misc: Improve speed of the one-file library generator (#3241, @wahern and #3005, @cwoffenden)
+
+v1.5.3 (dev version, unpublished)
+
+v1.5.2 (Jan, 2022)
+perf: Regain Minimal memset()-ing During Reuse of Compression Contexts (@Cyan4973, #2969)
+build: Build Zstd with `noexecstack` on All Architectures (@felixhandte, #2964)
+doc: Clarify Licensing (@terrelln, #2981)
+
+v1.5.1 (Dec, 2021)
+perf: rebalanced compression levels, to better match the intended speed/level curve, by @senhuang42
+perf: faster huffman decoder, using x64 assembly, by @terrelln
+perf: slightly faster high speed modes (strategies fast & dfast), by @felixhandte
+perf: improved binary size and faster compilation times, by @terrelln
+perf: new row64 mode, used notably in level 12, by @senhuang42
+perf: faster mid-level compression speed in presence of highly repetitive patterns, by @senhuang42
+perf: minor compression ratio improvements for small data at high levels, by @cyan4973
+perf: reduced stack usage (mostly useful for Linux Kernel), by @terrelln
+perf: faster compression speed on incompressible data, by @bindhvo
+perf: on-demand reduced ZSTD_DCtx state size, using build macro ZSTD_DECODER_INTERNAL_BUFFER, at a small cost of performance, by @bindhvo
+build: allows hiding static symbols in the dynamic library, using build macro, by @skitt
+build: support for m68k (Motorola 68000's), by @cyan4973
+build: improved AIX support, by @Helflym
+build: improved meson unofficial build, by @eli-schwartz
+cli : custom memory limit when training dictionary (#2925), by @embg
+cli : report advanced parameters information when compressing in very verbose mode (``-vv`), by @Svetlitski-FB
+
+v1.5.0  (May 11, 2021)
+api: Various functions promoted from experimental to stable API: (#2579-2581, @senhuang42)
+  `ZSTD_defaultCLevel()`
+  `ZSTD_getDictID_fromCDict()`
+api: Several experimental functions have been deprecated and will emit a compiler warning (#2582, @senhuang42)
+  `ZSTD_compress_advanced()`
+  `ZSTD_compress_usingCDict_advanced()`
+  `ZSTD_compressBegin_advanced()`
+  `ZSTD_compressBegin_usingCDict_advanced()`
+  `ZSTD_initCStream_srcSize()`
+  `ZSTD_initCStream_usingDict()`
+  `ZSTD_initCStream_usingCDict()`
+  `ZSTD_initCStream_advanced()`
+  `ZSTD_initCStream_usingCDict_advanced()`
+  `ZSTD_resetCStream()`
+api: ZSTDMT_NBWORKERS_MAX reduced to 64 for 32-bit environments (@Cyan4973)
+perf: Significant speed improvements for middle compression levels (#2494, @senhuang42 @terrelln)
+perf: Block splitter to improve compression ratio, enabled by default for high compression levels (#2447, @senhuang42)
+perf: Decompression loop refactor, speed improvements on `clang` and for `--long` modes (#2614 #2630, @Cyan4973)
+perf: Reduced stack usage during compression and decompression entropy stage (#2522 #2524, @terrelln)
+bug: Improve setting permissions of created files (#2525, @felixhandte)
+bug: Fix large dictionary non-determinism (#2607, @terrelln)
+bug: Fix non-determinism test failures on Linux i686 (#2606, @terrelln)
+bug: Fix various dedicated dictionary search bugs (#2540 #2586, @senhuang42 @felixhandte)
+bug: Ensure `ZSTD_estimateCCtxSize*() `monotonically increases with compression level (#2538, @senhuang42)
+bug: Fix --patch-from mode parameter bound bug with small files (#2637, @occivink)
+bug: Fix UBSAN error in decompression (#2625, @terrelln)
+bug: Fix superblock compression divide by zero bug (#2592, @senhuang42)
+bug: Make the number of physical CPU cores detection more robust (#2517, @PaulBone)
+doc: Improve `zdict.h` dictionary training API documentation (#2622, @terrelln)
+doc: Note that public `ZSTD_free*()` functions accept NULL pointers (#2521, @animalize)
+doc: Add style guide docs for open source contributors (#2626, @Cyan4973)
+tests: Better regression test coverage for different dictionary modes (#2559, @senhuang42)
+tests: Better test coverage of index reduction (#2603, @terrelln)
+tests: OSS-Fuzz coverage for seekable format (#2617, @senhuang42)
+tests: Test coverage for ZSTD threadpool API (#2604, @senhuang42)
+build: Dynamic library built multithreaded by default (#2584, @senhuang42)
+build: Move  `zstd_errors.h`  and  `zdict.h`  to  `lib/`  root (#2597, @terrelln)
+build: Allow `ZSTDMT_JOBSIZE_MIN` to be configured at compile-time, reduce default to 512KB (#2611, @Cyan4973)
+build: Single file library build script moved to `build/` directory (#2618, @felixhandte)
+build: `ZBUFF_*()` is no longer built by default (#2583, @senhuang42)
+build: Fixed Meson build (#2548, @SupervisedThinking @kloczek)
+build: Fix excessive compiler warnings with clang-cl and CMake (#2600, @nickhutchinson)
+build: Detect presence of `md5` on Darwin (#2609, @felixhandte)
+build: Avoid SIGBUS on armv6 (#2633, @bmwiedmann)
+cli: `--progress` flag added to always display progress bar (#2595, @senhuang42)
+cli: Allow reading from block devices with `--force` (#2613, @felixhandte)
+cli: Fix CLI filesize display bug (#2550, @Cyan4973)
+cli: Fix windows CLI `--filelist` end-of-line bug (#2620, @Cyan4973)
+contrib: Various fixes for linux kernel patch (#2539, @terrelln)
+contrib: Seekable format - Decompression hanging edge case fix (#2516, @senhuang42)
+contrib: Seekable format - New seek table-only API  (#2113 #2518, @mdittmer @Cyan4973)
+contrib: Seekable format - Fix seek table descriptor check when loading (#2534, @foxeng)
+contrib: Seekable format - Decompression fix for large offsets, (#2594, @azat)
+misc: Automatically published release tarballs available on Github (#2535, @felixhandte)
+
+v1.4.9  (Mar 1, 2021)
+bug: Use `umask()` to Constrain Created File Permissions (#2495, @felixhandte)
+bug: Make Simple Single-Pass Functions Ignore Advanced Parameters (#2498, @terrelln)
+api: Add (De)Compression Tracing Functionality (#2482, @terrelln)
+api: Support References to Multiple DDicts (#2446, @senhuang42)
+api: Add Function to Generate Skippable Frame (#2439, @senhuang42)
+perf: New Algorithms for the Long Distance Matcher (#2483, @mpu)
+perf: Performance Improvements for Long Distance Matcher (#2464, @mpu)
+perf: Don't Shrink Window Log when Streaming with a Dictionary (#2451, @terrelln)
+cli: Fix `--output-dir-mirror`'s Rejection of `..`-Containing Paths (#2512, @felixhandte)
+cli: Allow Input From Console When `-f`/`--force` is Passed (#2466, @felixhandte)
+cli: Improve Help Message (#2500, @senhuang42)
+tests: Remove Flaky Tests (#2455, #2486, #2445, @Cyan4973)
+tests: Correctly Invoke md5 Utility on NetBSD (#2492, @niacat)
+tests: Avoid Using `stat -c` on NetBSD (#2513, @felixhandte)
+build: Zstd CLI Can Now be Linked to Dynamic `libzstd` (#2457, #2454 @Cyan4973)
+build: Hide and Avoid Using Static-Only Symbols (#2501, #2504, @skitt)
+build: CMake: Enable Only C for lib/ and programs/ Projects (#2498, @concatime)
+build: CMake: Use `configure_file()` to Create the `.pc` File (#2462, @lazka)
+build: Fix Fuzzer Compiler Detection & Update UBSAN Flags (#2503, @terrelln)
+build: Add Guards for `_LARGEFILE_SOURCE` and `_LARGEFILE64_SOURCE` (#2444, @indygreg)
+build: Improve `zlibwrapper` Makefile (#2437, @Cyan4973)
+contrib: Add `recover_directory` Program (#2473, @terrelln)
+doc: Change License Year to 2021 (#2452 & #2465, @terrelln & @senhuang42)
+doc: Fix Typos (#2459, @ThomasWaldmann)
+
+v1.4.8  (Dec 18, 2020)
+hotfix: wrong alignment of an internal buffer
+
+v1.4.7  (Dec 16, 2020)
+perf: stronger --long mode at high compression levels, by @senhuang42
+perf: stronger --patch-from at high compression levels, thanks to --long improvements
+perf: faster dictionary compression at medium compression levels, by @felixhandte
+perf: small speed & memory usage improvements for ZSTD_compress2(), by @terrelln
+perf: improved fast compression speeds with Visual Studio, by @animalize
+cli : Set nb of threads with environment variable ZSTD_NBTHREADS, by @senhuang42
+cli : accept decompressing files with *.zstd suffix
+cli : provide a condensed summary by default when processing multiple files
+cli : fix : stdin input no longer confused as user prompt
+cli : improve accuracy of several error messages
+api : new sequence ingestion API, by @senhuang42
+api : shared thread pool: control total nb of threads used by multiple compression jobs, by @marxin
+api : new ZSTD_getDictID_fromCDict(), by @LuAPi
+api : zlibWrapper only uses public API, and is compatible with dynamic library, by @terrelln
+api : fix : multithreaded compression has predictable output even in special cases (see #2327) (issue not accessible from cli)
+api : fix : dictionary compression correctly respects dictionary compression level (see #2303) (issue not accessible from cli)
+build: fix cmake script when using path with spaces, by @terrelln
+build: improved compile-time detection of aarch64/neon platforms, by @bsdimp
+build: Fix building on AIX 5.1, by @likema
+build: compile paramgrill with cmake on Windows, requested by @mirh
+doc : clarify repcode updates in format specification, by @felixhandte
+
+v1.4.6
+fix : Always return dstSize_tooSmall when that is the case
+fix : Fix ZSTD_initCStream_advanced() with static allocation and no dictionary
+perf: Improve small block decompression speed by 20%+, by @terrelln
+perf: Reduce compression stack usage by 1 KB, by @terrelln
+perf: Improve decompression speed by improving ZSTD_wildcopy, by @helloguo (#2252, #2256)
+perf: Improve histogram construction, by @cyan4973 (#2253)
+cli : Add --output-dir-mirror option, by @xxie24 (#2219)
+cli : Warn when (de)compressing multiple files into a single output, by @senhuang42 (#2279)
+cli : Improved progress bar and status summary when (de)compressing multiple files, by @senhuang42 (#2283)
+cli : Call stat less often, by @felixhandte (#2262)
+cli : Allow --patch-from XXX and --filelist XXX in addition to --patch-from=XXX and --filelist=XXX, by @cyan4973 (#2250)
+cli : Allow --patch-from to compress stdin with --stream-size, by @bimbashrestha (#2206)
+api : Do not install zbuff.h, since it has long been deprecated, by @cyan4973 (#2166).
+api : Fix ZSTD_CCtx_setParameter() with ZSTD_c_compressionLevel to make 0 mean default level, by @i-do-cpp (#2291)
+api : Rename ZSTDMT_NBTHREADS_MAX to ZSTDMT_NBWORKERS_MAX, by @marxin (#2228).
+build: Install pkg-config file with CMake and MinGW, by @tonytheodore (#2183)
+build: Install DLL with CMake on Windows, by @BioDataAnalysis (#2221)
+build: Fix DLL install location with CMake, by @xantares and @bimbashrestha (#2186)
+build: Add ZSTD_NO_UNUSED_FUNCTIONS macro to hide unused functions
+build: Add ZSTD_NO_INTRINSICS macro to avoid explicit intrinsics
+build: Add STATIC_BMI2 macro for compile time detection of BMI2 on MSVC, by @Niadb (#2258)
+build: Fix -Wcomma warnings, by @cwoffenden
+build: Remove distutils requirement for meson build, by @neheb (#2197)
+build: Fix cli compilation with uclibc
+build: Fix cli compilation without st_mtime, by @ffontaine (#2246)
+build: Fix shadowing warnings in library
+build: Fix single file library compilation with Enscripten, by @yoshihitoh (#2227)
+misc: Improve single file library and include dictBuilder, by @cwoffenden
+misc: Allow compression dictionaries with missing symbols
+misc: Add freestanding translation script in contrib/freestanding_lib
+misc: Collect all of zstd's libc dependencies into zstd_deps.h
+doc : Add ZSTD_versionString() to manual, by @animalize
+doc : Fix documentation for ZSTD_CCtxParams_setParameter(), by @felixhandte (#2270)
+
+v1.4.5  (May 22, 2020)
+fix : Compression ratio regression on huge files (> 3 GB) using high levels (--ultra) and multithreading, by @terrelln
+perf: Improved decompression speed: x64 : +10% (clang) / +5% (gcc); ARM : from +15% to +50%, depending on SoC, by @terrelln
+perf: Automatically downsizes ZSTD_DCtx when too large for too long (#2069, by @bimbashreshta)
+perf: Improved fast compression speed on aarch64 (#2040, ~+3%, by @caoyzh)
+perf: Small level 1 compression speed gains (depending on compiler)
+cli : New --patch-from command, create and apply patches from files, by @bimbashreshta
+cli : New --filelist= : Provide a list of files to operate upon from a file
+cli : -b -d command can now benchmark decompression on multiple files
+cli : New --no-content-size command
+cli : New --show-default-cparams information command
+api : ZDICT_finalizeDictionary() is promoted to stable (#2111)
+api : new experimental parameter ZSTD_d_stableOutBuffer (#2094)
+build: Generate a single-file libzstd library (#2065, by @cwoffenden)
+build: Relative includes no longer require -I compiler flags for zstd lib subdirs (#2103, by @felixhandte)
+build: zstd now compiles cleanly under -pedantic (#2099)
+build: zstd now compiles with make-4.3
+build: Support mingw cross-compilation from Linux, by @Ericson2314
+build: Meson multi-thread build fix on windows
+build: Some misc icc fixes backed by new ci test on travis
+misc: bitflip analyzer tool, by @felixhandte
+misc: Extend largeNbDicts benchmark to compression
+misc: Edit-distance match finder in contrib/
+doc : Improved beginner CONTRIBUTING.md docs
+doc : New issue templates for zstd
+
+v1.4.4  (Nov 6, 2019)
+perf: Improved decompression speed, by > 10%, by @terrelln
+perf: Better compression speed when re-using a context, by @felixhandte
+perf: Fix compression ratio when compressing large files with small dictionary, by @senhuang42
+perf: zstd reference encoder can generate RLE blocks, by @bimbashrestha
+perf: minor generic speed optimization, by @davidbolvansky
+api: new ability to extract sequences from the parser for analysis, by @bimbashrestha
+api: fixed decoding of magic-less frames, by @terrelln
+api: fixed ZSTD_initCStream_advanced() performance with fast modes, reported by @QrczakMK
+cli: Named pipes support, by @bimbashrestha
+cli: short tar's extension support, by @stokito
+cli: command --output-dir-flat= , generates target files into requested directory, by @senhuang42
+cli: commands --stream-size=# and --size-hint=#, by @nmagerko
+cli: command --exclude-compressed, by @shashank0791
+cli: faster `-t` test mode
+cli: improved some error messages, by @vangyzen
+cli: fix command `-D dictionary` on Windows, reported by @artyompetrov
+cli: fix rare deadlock condition within dictionary builder, by @terrelln
+build: single-file decoder with emscripten compilation script, by @cwoffenden
+build: fixed zlibWrapper compilation on Visual Studio, reported by @bluenlive
+build: fixed deprecation warning for certain gcc version, reported by @jasonma163
+build: fix compilation on old gcc versions, by @cemeyer
+build: improved installation directories for cmake script, by Dmitri Shubin
+pack: modified pkgconfig, for better integration into openwrt, requested by @neheb
+misc: Improved documentation : ZSTD_CLEVEL, DYNAMIC_BMI2, ZSTD_CDict, function deprecation, zstd format
+misc: fixed educational decoder : accept larger literals section, and removed UNALIGNED() macro
+
+v1.4.3  (Aug 20, 2019)
+bug: Fix Dictionary Compression Ratio Regression by @cyan4973 (#1709)
+bug: Fix Buffer Overflow in legacy v0.3 decompression by @felixhandte (#1722)
+build: Add support for IAR C/C++ Compiler for Arm by @joseph0918 (#1705)
+
+v1.4.2  (Jul 26, 2019)
+bug: Fix bug in zstd-0.5 decoder by @terrelln (#1696)
+bug: Fix seekable decompression in-memory API by @iburinoc (#1695)
+misc: Validate blocks are smaller than size limit by @vivekmg (#1685)
+misc: Restructure source files by @ephiepark (#1679)
+
+v1.4.1  (Jul 20, 2019)
+bug: Fix data corruption in niche use cases by @terrelln (#1659)
+bug: Fuzz legacy modes, fix uncovered bugs by @terrelln (#1593, #1594, #1595)
+bug: Fix out of bounds read by @terrelln (#1590)
+perf: Improve decode speed by ~7% @mgrice (#1668)
+perf: Slightly improved compression ratio of level 3 and 4 (ZSTD_dfast) by @cyan4973 (#1681)
+perf: Slightly faster compression speed when re-using a context by @cyan4973 (#1658)
+perf: Improve compression ratio for small windowLog by @cyan4973 (#1624)
+perf: Faster compression speed in high compression mode for repetitive data by @terrelln (#1635)
+api: Add parameter to generate smaller dictionaries by @tyler-tran (#1656)
+cli: Recognize symlinks when built in C99 mode by @felixhandte (#1640)
+cli: Expose cpu load indicator for each file on -vv mode by @ephiepark (#1631)
+cli: Restrict read permissions on destination files by @chungy (#1644)
+cli: zstdgrep: handle -f flag by @felixhandte (#1618)
+cli: zstdcat: follow symlinks by @vejnar (#1604)
+doc: Remove extra size limit on compressed blocks by @felixhandte (#1689)
+doc: Fix typo by @yk-tanigawa (#1633)
+doc: Improve documentation on streaming buffer sizes by @cyan4973 (#1629)
+build: CMake: support building with LZ4 @leeyoung624 (#1626)
+build: CMake: install zstdless and zstdgrep by @leeyoung624 (#1647)
+build: CMake: respect existing uninstall target by @j301scott (#1619)
+build: Make: skip multithread tests when built without support by @michaelforney (#1620)
+build: Make: Fix examples/ test target by @sjnam (#1603)
+build: Meson: rename options out of deprecated namespace by @lzutao (#1665)
+build: Meson: fix build by @lzutao (#1602)
+build: Visual Studio: don't export symbols in static lib by @scharan (#1650)
+build: Visual Studio: fix linking by @absotively (#1639)
+build: Fix MinGW-W64 build by @myzhang1029 (#1600)
+misc: Expand decodecorpus coverage by @ephiepark (#1664)
+
+v1.4.0  (Apr 17, 2019)
+perf: Improve level 1 compression speed in most scenarios by 6% by @gbtucker and @terrelln
+api: Move the advanced API, including all functions in the staging section, to the stable section
+api: Make ZSTD_e_flush and ZSTD_e_end block for maximum forward progress
+api: Rename ZSTD_CCtxParam_getParameter to ZSTD_CCtxParams_getParameter
+api: Rename ZSTD_CCtxParam_setParameter to ZSTD_CCtxParams_setParameter
+api: Don't export ZSTDMT functions from the shared library by default
+api: Require ZSTD_MULTITHREAD to be defined to use ZSTDMT
+api: Add ZSTD_decompressBound() to provide an upper bound on decompressed size by @shakeelrao
+api: Fix ZSTD_decompressDCtx() corner cases with a dictionary
+api: Move ZSTD_getDictID_*() functions to the stable section
+api: Add ZSTD_c_literalCompressionMode flag to enable or disable literal compression by @terrelln
+api: Allow compression parameters to be set when a dictionary is used
+api: Allow setting parameters before or after ZSTD_CCtx_loadDictionary() is called
+api: Fix ZSTD_estimateCStreamSize_usingCCtxParams()
+api: Setting ZSTD_d_maxWindowLog to 0 means use the default
+cli: Ensure that a dictionary is not used to compress itself by @shakeelrao
+cli: Add --[no-]compress-literals flag to enable or disable literal compression
+doc: Update the examples to use the advanced API
+doc: Explain how to transition from old streaming functions to the advanced API in the header
+build: Improve the Windows release packages
+build: Improve CMake build by @hjmjohnson
+build: Build fixes for FreeBSD by @lwhsu
+build: Remove redundant warnings by @thatsafunnyname
+build: Fix tests on OpenBSD by @bket
+build: Extend fuzzer build system to work with the new clang engine
+build: CMake now creates the libzstd.so.1 symlink
+build: Improve Menson build by @lzutao
+misc: Fix symbolic link detection on FreeBSD
+misc: Use physical core count for -T0 on FreeBSD by @cemeyer
+misc: Fix zstd --list on truncated files by @kostmo
+misc: Improve logging in debug mode by @felixhandte
+misc: Add CirrusCI tests by @lwhsu
+misc: Optimize dictionary memory usage in corner cases
+misc: Improve the dictionary builder on small or homogeneous data
+misc: Fix spelling across the repo by @jsoref
+
+v1.3.8  (Dec 28, 2018)
+perf: better decompression speed on large files (+7%) and cold dictionaries (+15%)
+perf: slightly better compression ratio at high compression modes
+api : finalized advanced API, last stage before "stable" status
+api : new --rsyncable mode, by @terrelln
+api : support decompression of empty frames into NULL (used to be an error) (#1385)
+build: new set of macros to build a minimal size decoder, by @felixhandte
+build: fix compilation on MIPS32, reported by @clbr (#1441)
+build: fix compilation with multiple -arch flags, by @ryandesign
+build: highly upgraded meson build, by @lzutao
+build: improved buck support, by @obelisk
+build: fix cmake script : can create debug build, by @pitrou
+build: Makefile : grep works on both colored consoles and systems without color support
+build: fixed zstd-pgo, by @bmwiedemann
+cli : support ZSTD_CLEVEL environment variable, by @yijinfb (#1423)
+cli : --no-progress flag, preserving final summary (#1371), by @terrelln
+cli : ensure destination file is not source file (#1422)
+cli : clearer error messages, especially when input file not present
+doc : clarified zstd_compression_format.md, by @ulikunitz
+misc: fixed zstdgrep, returns 1 on failure, by @lzutao
+misc: NEWS renamed as CHANGELOG, in accordance with fboss
+
+v1.3.7  (Oct 20, 2018)
+perf: slightly better decompression speed on clang (depending on hardware target)
+fix : performance of dictionary compression for small input < 4 KB at levels 9 and 10
+build: no longer build backtrace by default in release mode; restrict further automatic mode
+build: control backtrace support through build macro BACKTRACE
+misc: added man pages for zstdless and zstdgrep, by @samrussell
+
+v1.3.6  (Oct 6, 2018)
+perf: much faster dictionary builder, by @jenniferliu
+perf: faster dictionary compression on small data when using multiple contexts, by @felixhandte
+perf: faster dictionary decompression when using a very large number of dictionaries simultaneously
+cli : fix : does no longer overwrite destination when source does not exist (#1082)
+cli : new command --adapt, for automatic compression level adaptation
+api : fix : block api can be streamed with > 4 GB, reported by @catid
+api : reduced ZSTD_DDict size by 2 KB
+api : minimum negative compression level is defined, and can be queried using ZSTD_minCLevel().
+build: support Haiku target, by @korli
+build: Read Legacy format is limited to v0.5+ by default. Can be changed at compile time with macro ZSTD_LEGACY_SUPPORT.
+doc : zstd_compression_format.md updated to match wording in IETF RFC 8478
+misc: tests/paramgrill, a parameter optimizer, by @GeorgeLu97
+
+v1.3.5  (Jun 29, 2018)
+perf: much faster dictionary compression, by @felixhandte
+perf: small quality improvement for dictionary generation, by @terrelln
+perf: slightly improved high compression levels (notably level 19)
+mem : automatic memory release for long duration contexts
+cli : fix : overlapLog can be manually set
+cli : fix : decoding invalid lz4 frames
+api : fix : performance degradation for dictionary compression when using advanced API, by @terrelln
+api : change : clarify ZSTD_CCtx_reset() vs ZSTD_CCtx_resetParameters(), by @terrelln
+build: select custom libzstd scope through control macros, by @GeorgeLu97
+build: OpenBSD patch, by @bket
+build: make and make all are compatible with -j
+doc : clarify zstd_compression_format.md, updated for IETF RFC process
+misc: pzstd compatible with reproducible compilation, by @lamby
+
+v1.3.4  (Mar 27, 2018)
+perf: faster speed (especially decoding speed) on recent cpus (haswell+)
+perf: much better performance associating --long with multi-threading, by @terrelln
+perf: better compression at levels 13-15
+cli : asynchronous compression by default, for faster experience (use --single-thread for former behavior)
+cli : smoother status report in multi-threading mode
+cli : added command --fast=#, for faster compression modes
+cli : fix crash when not overwriting existing files, by Pádraig Brady (@pixelb)
+api : `nbThreads` becomes `nbWorkers` : 1 triggers asynchronous mode
+api : compression levels can be negative, for even more speed
+api : ZSTD_getFrameProgression() : get precise progress status of ZSTDMT anytime
+api : ZSTDMT can accept new compression parameters during compression
+api : implemented all advanced dictionary decompression prototypes
+build: improved meson recipe, by Shawn Landden (@shawnl)
+build: VS2017 scripts, by @HaydnTrigg
+misc: all /contrib projects fixed
+misc: added /contrib/docker script by @gyscos
+
+v1.3.3  (Dec 21, 2017)
+perf: faster zstd_opt strategy (levels 16-19)
+fix : bug #944 : multithreading with shared ditionary and large data, reported by @gsliepen
+cli : fix : content size written in header by default
+cli : fix : improved LZ4 format support, by @felixhandte
+cli : new : hidden command `-S`, to benchmark multiple files while generating one result per file
+api : fix : support large skippable frames, by @terrelln
+api : fix : streaming interface was adding a useless 3-bytes null block to small frames
+api : change : when setting `pledgedSrcSize`, use `ZSTD_CONTENTSIZE_UNKNOWN` macro value to mean "unknown"
+build: fix : compilation under rhel6 and centos6, reported by @pixelb
+build: added `check` target
+
+v1.3.2  (Oct 10, 2017)
+new : long range mode, using --long command, by Stella Lau (@stellamplau)
+new : ability to generate and decode magicless frames (#591)
+changed : maximum nb of threads reduced to 200, to avoid address space exhaustion in 32-bits mode
+fix : multi-threading compression works with custom allocators
+fix : ZSTD_sizeof_CStream() was over-evaluating memory usage
+fix : a rare compression bug when compression generates very large distances and bunch of other conditions (only possible at --ultra -22)
+fix : 32-bits build can now decode large offsets (levels 21+)
+cli : added LZ4 frame support by default, by Felix Handte (@felixhandte)
+cli : improved --list output
+cli : new : can split input file for dictionary training, using command -B#
+cli : new : clean operation artefact on Ctrl-C interruption
+cli : fix : do not change /dev/null permissions when using command -t with root access, reported by @mike155 (#851)
+cli : fix : write file size in header in multiple-files mode
+api : added macro ZSTD_COMPRESSBOUND() for static allocation
+api : experimental : new advanced decompression API
+api : fix : sizeof_CCtx() used to over-estimate
+build: fix : no-multithread variant compiles without pool.c dependency, reported by Mitchell Blank Jr (@mitchblank) (#819)
+build: better compatibility with reproducible builds, by Bernhard M. Wiedemann (@bmwiedemann) (#818)
+example : added streaming_memory_usage
+license : changed /examples license to BSD + GPLv2
+license : fix a few header files to reflect new license (#825)
+
+v1.3.1  (Aug 21, 2017)
+New license : BSD + GPLv2
+perf: substantially decreased memory usage in Multi-threading mode, thanks to reports by Tino Reichardt (@mcmilk)
+perf: Multi-threading supports up to 256 threads. Cap at 256 when more are requested (#760)
+cli : improved and fixed --list command, by @ib (#772)
+cli : command -vV to list supported formats, by @ib (#771)
+build : fixed binary variants, reported by @svenha (#788)
+build : fix Visual compilation for non x86/x64 targets, reported by Greg Slazinski (@GregSlazinski) (#718)
+API exp : breaking change : ZSTD_getframeHeader() provides more information
+API exp : breaking change : pinned down values of error codes
+doc : fixed huffman example, by Ulrich Kunitz (@ulikunitz)
+new : contrib/adaptive-compression, I/O driven compression strength, by Paul Cruz (@paulcruz74)
+new : contrib/long_distance_matching, statistics by Stella Lau (@stellamplau)
+updated : contrib/linux-kernel, by Nick Terrell (@terrelln)
+
+v1.3.0  (Jul 6, 2017)
+cli : new : `--list` command, by Paul Cruz
+cli : changed : xz/lzma support enabled by default
+cli : changed : `-t *` continue processing list after a decompression error
+API : added : ZSTD_versionString()
+API : promoted to stable status : ZSTD_getFrameContentSize(), by Sean Purcell
+API exp : new advanced API : ZSTD_compress_generic(), ZSTD_CCtx_setParameter()
+API exp : new : API for static or external allocation : ZSTD_initStatic?Ctx()
+API exp : added : ZSTD_decompressBegin_usingDDict(), requested by Guy Riddle (#700)
+API exp : clarified memory estimation / measurement functions.
+API exp : changed : strongest strategy renamed ZSTD_btultra, fastest strategy ZSTD_fast set to 1
+tools : decodecorpus can generate random dictionary-compressed samples, by Paul Cruz
+new : contrib/seekable_format, demo and API, by Sean Purcell
+changed : contrib/linux-kernel, updated version and license, by Nick Terrell
+
+v1.2.0  (May 5, 2017)
+cli : changed : Multithreading enabled by default (use target zstd-nomt or HAVE_THREAD=0 to disable)
+cli : new : command -T0 means "detect and use nb of cores", by Sean Purcell
+cli : new : zstdmt symlink hardwired to `zstd -T0`
+cli : new : command --threads=# (#671)
+cli : changed : cover dictionary builder by default, for improved quality, by Nick Terrell
+cli : new : commands --train-cover and --train-legacy, to select dictionary algorithm and parameters
+cli : experimental targets `zstd4` and `xzstd4`, with support for lz4 format, by Sean Purcell
+cli : fix : does not output compressed data on console
+cli : fix : ignore symbolic links unless --force specified,
+API : breaking change : ZSTD_createCDict_advanced(), only use compressionParameters as argument
+API : added : prototypes ZSTD_*_usingCDict_advanced(), for direct control over frameParameters.
+API : improved: ZSTDMT_compressCCtx() reduced memory usage
+API : fix : ZSTDMT_compressCCtx() now provides srcSize in header (#634)
+API : fix : src size stored in frame header is controlled at end of frame
+API : fix : enforced consistent rules for pledgedSrcSize==0 (#641)
+API : fix : error code "GENERIC" replaced by "dstSizeTooSmall" when appropriate
+build: improved cmake script, by @Majlen
+build: enabled Multi-threading support for *BSD, by Baptiste Daroussin
+tools: updated Paramgrill. Command -O# provides best parameters for sample and speed target.
+new : contrib/linux-kernel version, by Nick Terrell
+
+v1.1.4  (Mar 18, 2017)
+cli : new : can compress in *.gz format, using --format=gzip command, by Przemyslaw Skibinski
+cli : new : advanced benchmark command --priority=rt
+cli : fix : write on sparse-enabled file systems in 32-bits mode, by @ds77
+cli : fix : --rm remains silent when input is stdin
+cli : experimental : xzstd, with support for xz/lzma decoding, by Przemyslaw Skibinski
+speed : improved decompression speed in streaming mode for single shot scenarios (+5%)
+memory: DDict (decompression dictionary) memory usage down from 150 KB to 20 KB
+arch: 32-bits variant able to generate and decode very long matches (>32 MB), by Sean Purcell
+API : new : ZSTD_findFrameCompressedSize(), ZSTD_getFrameContentSize(), ZSTD_findDecompressedSize()
+API : changed : dropped support of legacy versions <= v0.3 (can be changed by modifying ZSTD_LEGACY_SUPPORT value)
+build : new: meson build system in contrib/meson, by Dima Krasner
+build : improved cmake script, by @Majlen
+build : added -Wformat-security flag, as recommended by Padraig Brady
+doc : new : educational decoder, by Sean Purcell
+
+v1.1.3  (Feb 7, 2017)
+cli : zstd can decompress .gz files (can be disabled with `make zstd-nogz` or `make HAVE_ZLIB=0`)
+cli : new : experimental target `make zstdmt`, with multi-threading support
+cli : new : improved dictionary builder "cover" (experimental), by Nick Terrell, based on prior work by Giuseppe Ottaviano.
+cli : new : advanced commands for detailed parameters, by Przemyslaw Skibinski
+cli : fix zstdless on Mac OS-X, by Andrew Janke
+cli : fix #232 "compress non-files"
+dictBuilder : improved dictionary generation quality, thanks to Nick Terrell
+API : new : lib/compress/ZSTDMT_compress.h multithreading API (experimental)
+API : new : ZSTD_create?Dict_byReference(), requested by Bartosz Taudul
+API : new : ZDICT_finalizeDictionary()
+API : fix : ZSTD_initCStream_usingCDict() properly writes dictID into frame header, by Gregory Szorc (#511)
+API : fix : all symbols properly exposed in libzstd, by Nick Terrell
+build : support for Solaris target, by Przemyslaw Skibinski
+doc : clarified specification, by Sean Purcell
+
+v1.1.2  (Dec 15, 2016)
+API : streaming : decompression : changed : automatic implicit reset when chain-decoding new frames without init
+API : experimental : added : dictID retrieval functions, and ZSTD_initCStream_srcSize()
+API : zbuff : changed : prototypes now generate deprecation warnings
+lib : improved : faster decompression speed at ultra compression settings and 32-bits mode
+lib : changed : only public ZSTD_ symbols are now exposed
+lib : changed : reduced usage  of stack memory
+lib : fixed : several corner case bugs, by Nick Terrell
+cli : new : gzstd, experimental version able to decode .gz files, by Przemyslaw Skibinski
+cli : new : preserve file attributes
+cli : new : added zstdless and zstdgrep tools
+cli : fixed : status displays total amount decoded, even for file consisting of multiple frames (like pzstd)
+cli : fixed : zstdcat
+zlib_wrapper : added support for gz* functions, by Przemyslaw Skibinski
+install : better compatibility with FreeBSD, by Dimitry Andric
+source tree : changed : zbuff source files moved to lib/deprecated
+
+v1.1.1  (Nov 2, 2016)
+New : command -M#, --memory=, --memlimit=, --memlimit-decompress= to limit allowed memory consumption
+New : doc/zstd_manual.html, by Przemyslaw Skibinski
+Improved : slightly better compression ratio at --ultra levels (>= 20)
+Improved : better memory usage when using streaming compression API, thanks to @Rogier-5 report
+Added : API : ZSTD_initCStream_usingCDict(), ZSTD_initDStream_usingDDict() (experimental section)
+Added : example/multiple_streaming_compression.c
+Changed : zstd_errors.h is now installed within /include (and replaces errors_public.h)
+Updated man page
+Fixed : zstd-small, zstd-compress and zstd-decompress compilation targets
+
+v1.1.0  (Sep 28, 2016)
+New : contrib/pzstd, parallel version of zstd, by Nick Terrell
+added : NetBSD install target (#338)
+Improved : speed for batches of small files
+Improved : speed of zlib wrapper, by Przemyslaw Skibinski
+Changed : libzstd on Windows supports legacy formats, by Christophe Chevalier
+Fixed : CLI -d output to stdout by default when input is stdin (#322)
+Fixed : CLI correctly detects console on Mac OS-X
+Fixed : CLI supports recursive mode `-r` on Mac OS-X
+Fixed : Legacy decoders use unified error codes, reported by benrg (#341), fixed by Przemyslaw Skibinski
+Fixed : compatibility with OpenBSD, reported by Juan Francisco Cantero Hurtado (#319)
+Fixed : compatibility with Hurd, by Przemyslaw Skibinski (#365)
+Fixed : zstd-pgo, reported by octoploid (#329)
+
+v1.0.0  (Sep 1, 2016)
+Change Licensing, all project is now BSD, Copyright Facebook
+Small decompression speed improvement
+API : Streaming API supports legacy format
+API : ZDICT_getDictID(), ZSTD_sizeof_{CCtx, DCtx, CStream, DStream}(), ZSTD_setDStreamParameter()
+CLI supports legacy formats v0.4+
+Fixed : compression fails on certain huge files, reported by Jesse McGrew
+Enhanced documentation, by Przemyslaw Skibinski
+
+v0.8.1  (Aug 18, 2016)
+New streaming API
+Changed : --ultra now enables levels beyond 19
+Changed : -i# now selects benchmark time in second
+Fixed : ZSTD_compress* can now compress > 4 GB in a single pass, reported by Nick Terrell
+Fixed : speed regression on specific patterns (#272)
+Fixed : support for Z_SYNC_FLUSH, by Dmitry Krot (#291)
+Fixed : ICC compilation, by Przemyslaw Skibinski
+
+v0.8.0  (Aug 2, 2016)
+Improved : better speed on clang and gcc -O2, thanks to Eric Biggers
+New : Build on FreeBSD and DragonFly, thanks to JrMarino
+Changed : modified API : ZSTD_compressEnd()
+Fixed : legacy mode with ZSTD_HEAPMODE=0, by Christopher Bergqvist
+Fixed : premature end of frame when zero-sized raw block, reported by Eric Biggers
+Fixed : large dictionaries (> 384 KB), reported by Ilona Papava
+Fixed : checksum correctly checked in single-pass mode
+Fixed : combined --test amd --rm, reported by Andreas M. Nilsson
+Modified : minor compression level adaptations
+Updated : compression format specification to v0.2.0
+changed : zstd.h moved to /lib directory
+
+v0.7.5  (Aug 1, 2016)
+Transition version, supporting decoding of v0.8.x
+
+v0.7.4  (Jul 17, 2016)
+Added : homebrew for Mac, by Daniel Cade
+Added : more examples
+Fixed : segfault when using small dictionaries, reported by Felix Handte
+Modified : default compression level for CLI is now 3
+Updated : specification, to v0.1.1
+
+v0.7.3  (Jul 9, 2016)
+New : compression format specification
+New : `--` separator, stating that all following arguments are file names. Suggested by Chip Turner.
+New : `ZSTD_getDecompressedSize()`
+New : OpenBSD target, by Juan Francisco Cantero Hurtado
+New : `examples` directory
+fixed : dictBuilder using HC levels, reported by Bartosz Taudul
+fixed : legacy support from ZSTD_decompress_usingDDict(), reported by Felix Handte
+fixed : multi-blocks decoding with intermediate uncompressed blocks, reported by Greg Slazinski
+modified : removed "mem.h" and "error_public.h" dependencies from "zstd.h" (experimental section)
+modified : legacy functions no longer need magic number
+
+v0.7.2  (Jul 4, 2016)
+fixed : ZSTD_decompressBlock() using multiple consecutive blocks. Reported by Greg Slazinski.
+fixed : potential segfault on very large files (many gigabytes). Reported by Chip Turner.
+fixed : CLI displays system error message when destination file cannot be created (#231). Reported by Chip Turner.
+
+v0.7.1  (Jun 23, 2016)
+fixed : ZBUFF_compressEnd() called multiple times with too small `dst` buffer, reported by Christophe Chevalier
+fixed : dictBuilder fails if first sample is too small, reported by Руслан Ковалёв
+fixed : corruption issue, reported by cj
+modified : checksum enabled by default in command line mode
+
+v0.7.0  (Jun 17, 2016)
+New : Support for directory compression, using `-r`, thanks to Przemyslaw Skibinski
+New : Command `--rm`, to remove source file after successful de/compression
+New : Visual build scripts, by Christophe Chevalier
+New : Support for Sparse File-systems (do not use space for zero-filled sectors)
+New : Frame checksum support
+New : Support pass-through mode (when using `-df`)
+API : more efficient Dictionary API : `ZSTD_compress_usingCDict()`, `ZSTD_decompress_usingDDict()`
+API : create dictionary files from custom content, by Giuseppe Ottaviano
+API : support for custom malloc/free functions
+New : controllable Dictionary ID
+New : Support for skippable frames
+
+v0.6.1  (May 13, 2016)
+New : zlib wrapper API, thanks to Przemyslaw Skibinski
+New : Ability to compile compressor / decompressor separately
+Changed : new lib directory structure
+Fixed : Legacy codec v0.5 compatible with dictionary decompression
+Fixed : Decoder corruption error (#173)
+Fixed : null-string roundtrip (#176)
+New : benchmark mode can select directory as input
+Experimental : midipix support, VMS support
+
+v0.6.0  (Apr 13, 2016)
+Stronger high compression modes, thanks to Przemyslaw Skibinski
+API : ZSTD_getFrameParams() provides size of decompressed content
+New : highest compression modes require `--ultra` command to fully unleash their capacity
+Fixed : zstd cli return error code > 0 and removes dst file artifact when decompression fails, thanks to Chip Turner
+
+v0.5.1  (Feb 18, 2016)
+New : Optimal parsing => Very high compression modes, thanks to Przemyslaw Skibinski
+Changed : Dictionary builder integrated into libzstd and zstd cli
+Changed (!) : zstd cli now uses "multiple input files" as default mode. See `zstd -h`.
+Fix : high compression modes for big-endian platforms
+New : zstd cli : `-t` | `--test` command
+
+v0.5.0  (Feb 5, 2016)
+New : dictionary builder utility
+Changed : streaming & dictionary API
+Improved : better compression of small data
+
+v0.4.7  (Jan 22, 2016)
+Improved : small compression speed improvement in HC mode
+Changed : `zstd_decompress.c` has ZSTD_LEGACY_SUPPORT to 0 by default
+fix : bt search bug
+
+v0.4.6  (Jan 13, 2016)
+fix : fast compression mode on Windows
+New : cmake configuration file, thanks to Artyom Dymchenko
+Improved : high compression mode on repetitive data
+New : block-level API
+New : ZSTD_duplicateCCtx()
+
+v0.4.5  (Dec 18, 2015)
+new : -m/--multiple : compress/decompress multiple files
+
+v0.4.4  (Dec 14, 2015)
+Fixed : high compression modes for Windows 32 bits
+new : external dictionary API extended to buffered mode and accessible through command line
+new : windows DLL project, thanks to Christophe Chevalier
+
+v0.4.3  (Dec 7, 2015)
+new : external dictionary API
+new : zstd-frugal
+
+v0.4.2  (Dec 2, 2015)
+Generic minor improvements for small blocks
+Fixed : big-endian compatibility, by Peter Harris (#85)
+
+v0.4.1  (Dec 1, 2015)
+Fixed : ZSTD_LEGACY_SUPPORT=0 build mode (reported by Luben)
+removed `zstd.c`
+
+v0.4.0  (Nov 29, 2015)
+Command line utility compatible with high compression levels
+Removed zstdhc => merged into zstd
+Added : ZBUFF API (see zstd_buffered.h)
+Rolling buffer support
+
+v0.3.6  (Nov 10, 2015)
+small blocks params
+
+v0.3.5  (Nov 9, 2015)
+minor generic compression improvements
+
+v0.3.4  (Nov 6, 2015)
+Faster fast cLevels
+
+v0.3.3  (Nov 5, 2015)
+Small compression ratio improvement
+
+v0.3.2  (Nov 2, 2015)
+Fixed Visual Studio
+
+v0.3.1  (Nov 2, 2015)
+Small compression ratio improvement
+
+v0.3  (Oct 30, 2015)
+HC mode : compression levels 2-26
+
+v0.2.2  (Oct 28, 2015)
+Fix : Visual Studio 2013 & 2015 release compilation, by Christophe Chevalier
+
+v0.2.1  (Oct 24, 2015)
+Fix : Read errors, advanced fuzzer tests, by Hanno Böck
+
+v0.2.0  (Oct 22, 2015)
+**Breaking format change**
+Faster decompression speed
+Can still decode v0.1 format
+
+v0.1.3  (Oct 15, 2015)
+fix uninitialization warning, reported by Evan Nemerson
+
+v0.1.2  (Sep 11, 2015)
+frame concatenation support
+
+v0.1.1  (Aug 27, 2015)
+fix compression bug
+detects write-flush errors
+
+v0.1.0  (Aug 25, 2015)
+first release
--- a/third_party/zstd/COPYING
+++ b/third_party/zstd/COPYING
@ -0,0 +1,339 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+                            NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
--- a/third_party/zstd/LICENSE
+++ b/third_party/zstd/LICENSE
@ -0,0 +1,30 @@
+BSD License
+
+For Zstandard software
+
+Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name Facebook, nor Meta, nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/third_party/zstd/README.cosmo
+++ b/third_party/zstd/README.cosmo
@ -0,0 +1,12 @@
+DESCRIPTION
+
+  Zstandard - Fast real-time compression algorithm
+
+ORIGIN
+
+  https://github.com/facebook/zstd
+  e2cd353473650472a095aa0b50e3bd560b714402
+
+LOCAL CHANGES
+
+  Changed layout of files slightly to better suit Cosmopolitan.
--- a/third_party/zstd/README.md
+++ b/third_party/zstd/README.md
@ -0,0 +1,223 @@
+<p align="center"><img src="https://raw.githubusercontent.com/facebook/zstd/dev/doc/images/zstd_logo86.png" alt="Zstandard"></p>
+
+__Zstandard__, or `zstd` as short version, is a fast lossless compression algorithm,
+targeting real-time compression scenarios at zlib-level and better compression ratios.
+It's backed by a very fast entropy stage, provided by [Huff0 and FSE library](https://github.com/Cyan4973/FiniteStateEntropy).
+
+Zstandard's format is stable and documented in [RFC8878](https://datatracker.ietf.org/doc/html/rfc8878). Multiple independent implementations are already available.
+This repository represents the reference implementation, provided as an open-source dual [BSD](LICENSE) and [GPLv2](COPYING) licensed **C** library,
+and a command line utility producing and decoding `.zst`, `.gz`, `.xz` and `.lz4` files.
+Should your project require another programming language,
+a list of known ports and bindings is provided on [Zstandard homepage](https://facebook.github.io/zstd/#other-languages).
+
+**Development branch status:**
+
+[![Build Status][travisDevBadge]][travisLink]
+[![Build status][CircleDevBadge]][CircleLink]
+[![Build status][CirrusDevBadge]][CirrusLink]
+[![Fuzzing Status][OSSFuzzBadge]][OSSFuzzLink]
+
+[travisDevBadge]: https://api.travis-ci.com/facebook/zstd.svg?branch=dev "Continuous Integration test suite"
+[travisLink]: https://travis-ci.com/facebook/zstd
+[CircleDevBadge]: https://circleci.com/gh/facebook/zstd/tree/dev.svg?style=shield "Short test suite"
+[CircleLink]: https://circleci.com/gh/facebook/zstd
+[CirrusDevBadge]: https://api.cirrus-ci.com/github/facebook/zstd.svg?branch=dev
+[CirrusLink]: https://cirrus-ci.com/github/facebook/zstd
+[OSSFuzzBadge]: https://oss-fuzz-build-logs.storage.googleapis.com/badges/zstd.svg
+[OSSFuzzLink]: https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:zstd
+
+## Benchmarks
+
+For reference, several fast compression algorithms were tested and compared
+on a desktop running Ubuntu 20.04 (`Linux 5.11.0-41-generic`),
+with a Core i7-9700K CPU @ 4.9GHz,
+using [lzbench], an open-source in-memory benchmark by @inikep
+compiled with [gcc] 9.3.0,
+on the [Silesia compression corpus].
+
+[lzbench]: https://github.com/inikep/lzbench
+[Silesia compression corpus]: https://sun.aei.polsl.pl//~sdeor/index.php?page=silesia
+[gcc]: https://gcc.gnu.org/
+
+| Compressor name         | Ratio | Compression| Decompress.|
+| ---------------         | ------| -----------| ---------- |
+| **zstd 1.5.1 -1**       | 2.887 |   530 MB/s |  1700 MB/s |
+| [zlib] 1.2.11 -1        | 2.743 |    95 MB/s |   400 MB/s |
+| brotli 1.0.9 -0         | 2.702 |   395 MB/s |   450 MB/s |
+| **zstd 1.5.1 --fast=1** | 2.437 |   600 MB/s |  2150 MB/s |
+| **zstd 1.5.1 --fast=3** | 2.239 |   670 MB/s |  2250 MB/s |
+| quicklz 1.5.0 -1        | 2.238 |   540 MB/s |   760 MB/s |
+| **zstd 1.5.1 --fast=4** | 2.148 |   710 MB/s |  2300 MB/s |
+| lzo1x 2.10 -1           | 2.106 |   660 MB/s |   845 MB/s |
+| [lz4] 1.9.3             | 2.101 |   740 MB/s |  4500 MB/s |
+| lzf 3.6 -1              | 2.077 |   410 MB/s |   830 MB/s |
+| snappy 1.1.9            | 2.073 |   550 MB/s |  1750 MB/s |
+
+[zlib]: https://www.zlib.net/
+[lz4]: https://lz4.github.io/lz4/
+
+The negative compression levels, specified with `--fast=#`,
+offer faster compression and decompression speed
+at the cost of compression ratio (compared to level 1).
+
+Zstd can also offer stronger compression ratios at the cost of compression speed.
+Speed vs Compression trade-off is configurable by small increments.
+Decompression speed is preserved and remains roughly the same at all settings,
+a property shared by most LZ compression algorithms, such as [zlib] or lzma.
+
+The following tests were run
+on a server running Linux Debian (`Linux version 4.14.0-3-amd64`)
+with a Core i7-6700K CPU @ 4.0GHz,
+using [lzbench], an open-source in-memory benchmark by @inikep
+compiled with [gcc] 7.3.0,
+on the [Silesia compression corpus].
+
+Compression Speed vs Ratio | Decompression Speed
+---------------------------|--------------------
+![Compression Speed vs Ratio](doc/images/CSpeed2.png "Compression Speed vs Ratio") | ![Decompression Speed](doc/images/DSpeed3.png "Decompression Speed")
+
+A few other algorithms can produce higher compression ratios at slower speeds, falling outside of the graph.
+For a larger picture including slow modes, [click on this link](doc/images/DCspeed5.png).
+
+
+## The case for Small Data compression
+
+Previous charts provide results applicable to typical file and stream scenarios (several MB). Small data comes with different perspectives.
+
+The smaller the amount of data to compress, the more difficult it is to compress. This problem is common to all compression algorithms, and reason is, compression algorithms learn from past data how to compress future data. But at the beginning of a new data set, there is no "past" to build upon.
+
+To solve this situation, Zstd offers a __training mode__, which can be used to tune the algorithm for a selected type of data.
+Training Zstandard is achieved by providing it with a few samples (one file per sample). The result of this training is stored in a file called "dictionary", which must be loaded before compression and decompression.
+Using this dictionary, the compression ratio achievable on small data improves dramatically.
+
+The following example uses the `github-users` [sample set](https://github.com/facebook/zstd/releases/tag/v1.1.3), created from [github public API](https://developer.github.com/v3/users/#get-all-users).
+It consists of roughly 10K records weighing about 1KB each.
+
+Compression Ratio | Compression Speed | Decompression Speed
+------------------|-------------------|--------------------
+![Compression Ratio](doc/images/dict-cr.png "Compression Ratio") | ![Compression Speed](doc/images/dict-cs.png "Compression Speed") | ![Decompression Speed](doc/images/dict-ds.png "Decompression Speed")
+
+
+These compression gains are achieved while simultaneously providing _faster_ compression and decompression speeds.
+
+Training works if there is some correlation in a family of small data samples. The more data-specific a dictionary is, the more efficient it is (there is no _universal dictionary_).
+Hence, deploying one dictionary per type of data will provide the greatest benefits.
+Dictionary gains are mostly effective in the first few KB. Then, the compression algorithm will gradually use previously decoded content to better compress the rest of the file.
+
+### Dictionary compression How To:
+
+1. Create the dictionary
+
+   `zstd --train FullPathToTrainingSet/* -o dictionaryName`
+
+2. Compress with dictionary
+
+   `zstd -D dictionaryName FILE`
+
+3. Decompress with dictionary
+
+   `zstd -D dictionaryName --decompress FILE.zst`
+
+
+## Build instructions
+
+`make` is the officially maintained build system of this project.
+All other build systems are "compatible" and 3rd-party maintained,
+they may feature small differences in advanced options.
+When your system allows it, prefer using `make` to build `zstd` and `libzstd`.
+
+### Makefile
+
+If your system is compatible with standard `make` (or `gmake`),
+invoking `make` in root directory will generate `zstd` cli in root directory.
+It will also create `libzstd` into `lib/`.
+
+Other available options include:
+- `make install` : create and install zstd cli, library and man pages
+- `make check` : create and run `zstd`, test its behavior on local platform
+
+The `Makefile` follows the [GNU Standard Makefile conventions](https://www.gnu.org/prep/standards/html_node/Makefile-Conventions.html),
+allowing staged install, standard flags, directory variables and command variables.
+
+For advanced use cases, specialized compilation flags which control binary generation
+are documented in [`lib/README.md`](lib/README.md#modular-build) for the `libzstd` library
+and in [`programs/README.md`](programs/README.md#compilation-variables) for the `zstd` CLI.
+
+### cmake
+
+A `cmake` project generator is provided within `build/cmake`.
+It can generate Makefiles or other build scripts
+to create `zstd` binary, and `libzstd` dynamic and static libraries.
+
+By default, `CMAKE_BUILD_TYPE` is set to `Release`.
+
+#### Support for Fat (Universal2) Output
+
+`zstd` can be built and installed with support for both Apple Silicon (M1/M2) as well as Intel by using CMake's Universal2 support.
+To perform a Fat/Universal2 build and install use the following commands:
+
+```bash
+cmake -B build-cmake-debug -S build/cmake -G Ninja -DCMAKE_OSX_ARCHITECTURES="x86_64;x86_64h;arm64"
+cd build-cmake-debug
+ninja
+sudo ninja install
+```
+
+### Meson
+
+A Meson project is provided within [`build/meson`](build/meson). Follow
+build instructions in that directory.
+
+You can also take a look at [`.travis.yml`](.travis.yml) file for an
+example about how Meson is used to build this project.
+
+Note that default build type is **release**.
+
+### VCPKG
+You can build and install zstd [vcpkg](https://github.com/Microsoft/vcpkg/) dependency manager:
+
+    git clone https://github.com/Microsoft/vcpkg.git
+    cd vcpkg
+    ./bootstrap-vcpkg.sh
+    ./vcpkg integrate install
+    ./vcpkg install zstd
+
+The zstd port in vcpkg is kept up to date by Microsoft team members and community contributors.
+If the version is out of date, please [create an issue or pull request](https://github.com/Microsoft/vcpkg) on the vcpkg repository.
+
+### Visual Studio (Windows)
+
+Going into `build` directory, you will find additional possibilities:
+- Projects for Visual Studio 2005, 2008 and 2010.
+  + VS2010 project is compatible with VS2012, VS2013, VS2015 and VS2017.
+- Automated build scripts for Visual compiler by [@KrzysFR](https://github.com/KrzysFR), in `build/VS_scripts`,
+  which will build `zstd` cli and `libzstd` library without any need to open Visual Studio solution.
+
+### Buck
+
+You can build the zstd binary via buck by executing: `buck build programs:zstd` from the root of the repo.
+The output binary will be in `buck-out/gen/programs/`.
+
+## Testing
+
+You can run quick local smoke tests by running `make check`.
+If you can't use `make`, execute the `playTest.sh` script from the `src/tests` directory.
+Two env variables `$ZSTD_BIN` and `$DATAGEN_BIN` are needed for the test script to locate the `zstd` and `datagen` binary.
+For information on CI testing, please refer to `TESTING.md`.
+
+## Status
+
+Zstandard is currently deployed within Facebook and many other large cloud infrastructures.
+It is run continuously to compress large amounts of data in multiple formats and use cases.
+Zstandard is considered safe for production environments.
+
+## License
+
+Zstandard is dual-licensed under [BSD](LICENSE) and [GPLv2](COPYING).
+
+## Contributing
+
+The `dev` branch is the one where all contributions are merged before reaching `release`.
+If you plan to propose a patch, please commit into the `dev` branch, or its own feature branch.
+Direct commit to `release` are not permitted.
+For more information, please read [CONTRIBUTING](CONTRIBUTING.md).
--- a/third_party/zstd/lib/common/allocations.h
+++ b/third_party/zstd/lib/common/allocations.h
@ -0,0 +1,56 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* This file provides custom allocation primitives
+ */
+
+#define ZSTD_DEPS_NEED_MALLOC
+#include "third_party/zstd/lib/common/zstd_deps.h"   /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
+
+#include "third_party/zstd/lib/common/compiler.h" /* MEM_STATIC */
+#define ZSTD_STATIC_LINKING_ONLY
+#include "third_party/zstd/zstd.h" /* ZSTD_customMem */
+
+#ifndef ZSTD_ALLOCATIONS_H
+#define ZSTD_ALLOCATIONS_H
+
+/* custom memory allocation functions */
+
+MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem)
+{
+    if (customMem.customAlloc)
+        return customMem.customAlloc(customMem.opaque, size);
+    return ZSTD_malloc(size);
+}
+
+MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem)
+{
+    if (customMem.customAlloc) {
+        /* calloc implemented as malloc+memset;
+         * not as efficient as calloc, but next best guess for custom malloc */
+        void* const ptr = customMem.customAlloc(customMem.opaque, size);
+        ZSTD_memset(ptr, 0, size);
+        return ptr;
+    }
+    return ZSTD_calloc(1, size);
+}
+
+MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem)
+{
+    if (ptr!=NULL) {
+        if (customMem.customFree)
+            customMem.customFree(customMem.opaque, ptr);
+        else
+            ZSTD_free(ptr);
+    }
+}
+
+#endif /* ZSTD_ALLOCATIONS_H */
--- a/third_party/zstd/lib/common/bits.h
+++ b/third_party/zstd/lib/common/bits.h
@ -0,0 +1,202 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_BITS_H
+#define ZSTD_BITS_H
+
+#include "third_party/zstd/lib/common/mem.h"
+#include "libc/assert.h"
+
+MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val)
+{
+    assert(val != 0);
+    {
+        static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3,
+                                                30, 22, 20, 15, 25, 17, 4, 8,
+                                                31, 27, 13, 23, 21, 19, 16, 7,
+                                                26, 12, 18, 6, 11, 5, 10, 9};
+        return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27];
+    }
+}
+
+MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val)
+{
+    assert(val != 0);
+#   if defined(_MSC_VER)
+#       if STATIC_BMI2 == 1
+            return (unsigned)_tzcnt_u32(val);
+#       else
+            if (val != 0) {
+                unsigned long r;
+                _BitScanForward(&r, val);
+                return (unsigned)r;
+            } else {
+                /* Should not reach this code path */
+                __assume(0);
+            }
+#       endif
+#   elif defined(__GNUC__) && (__GNUC__ >= 4)
+        return (unsigned)__builtin_ctz(val);
+#   else
+        return ZSTD_countTrailingZeros32_fallback(val);
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) {
+    assert(val != 0);
+    {
+        static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29,
+                                            11, 14, 16, 18, 22, 25, 3, 30,
+                                            8, 12, 20, 28, 15, 17, 24, 7,
+                                            19, 27, 23, 6, 26, 5, 4, 31};
+        val |= val >> 1;
+        val |= val >> 2;
+        val |= val >> 4;
+        val |= val >> 8;
+        val |= val >> 16;
+        return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27];
+    }
+}
+
+MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val)
+{
+    assert(val != 0);
+#   if defined(_MSC_VER)
+#       if STATIC_BMI2 == 1
+            return (unsigned)_lzcnt_u32(val);
+#       else
+            if (val != 0) {
+                unsigned long r;
+                _BitScanReverse(&r, val);
+                return (unsigned)(31 - r);
+            } else {
+                /* Should not reach this code path */
+                __assume(0);
+            }
+#       endif
+#   elif defined(__GNUC__) && (__GNUC__ >= 4)
+        return (unsigned)__builtin_clz(val);
+#   else
+        return ZSTD_countLeadingZeros32_fallback(val);
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val)
+{
+    assert(val != 0);
+#   if defined(_MSC_VER) && defined(_WIN64)
+#       if STATIC_BMI2 == 1
+            return (unsigned)_tzcnt_u64(val);
+#       else
+            if (val != 0) {
+                unsigned long r;
+                _BitScanForward64(&r, val);
+                return (unsigned)r;
+            } else {
+                /* Should not reach this code path */
+                __assume(0);
+            }
+#       endif
+#   elif defined(__GNUC__) && (__GNUC__ >= 4) && defined(__LP64__)
+        return (unsigned)__builtin_ctzll(val);
+#   else
+        {
+            U32 mostSignificantWord = (U32)(val >> 32);
+            U32 leastSignificantWord = (U32)val;
+            if (leastSignificantWord == 0) {
+                return 32 + ZSTD_countTrailingZeros32(mostSignificantWord);
+            } else {
+                return ZSTD_countTrailingZeros32(leastSignificantWord);
+            }
+        }
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val)
+{
+    assert(val != 0);
+#   if defined(_MSC_VER) && defined(_WIN64)
+#       if STATIC_BMI2 == 1
+            return (unsigned)_lzcnt_u64(val);
+#       else
+            if (val != 0) {
+                unsigned long r;
+                _BitScanReverse64(&r, val);
+                return (unsigned)(63 - r);
+            } else {
+                /* Should not reach this code path */
+                __assume(0);
+            }
+#       endif
+#   elif defined(__GNUC__) && (__GNUC__ >= 4)
+        return (unsigned)(__builtin_clzll(val));
+#   else
+        {
+            U32 mostSignificantWord = (U32)(val >> 32);
+            U32 leastSignificantWord = (U32)val;
+            if (mostSignificantWord == 0) {
+                return 32 + ZSTD_countLeadingZeros32(leastSignificantWord);
+            } else {
+                return ZSTD_countLeadingZeros32(mostSignificantWord);
+            }
+        }
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val)
+{
+    if (MEM_isLittleEndian()) {
+        if (MEM_64bits()) {
+            return ZSTD_countTrailingZeros64((U64)val) >> 3;
+        } else {
+            return ZSTD_countTrailingZeros32((U32)val) >> 3;
+        }
+    } else {  /* Big Endian CPU */
+        if (MEM_64bits()) {
+            return ZSTD_countLeadingZeros64((U64)val) >> 3;
+        } else {
+            return ZSTD_countLeadingZeros32((U32)val) >> 3;
+        }
+    }
+}
+
+MEM_STATIC unsigned ZSTD_highbit32(U32 val)   /* compress, dictBuilder, decodeCorpus */
+{
+    assert(val != 0);
+    return 31 - ZSTD_countLeadingZeros32(val);
+}
+
+/* ZSTD_rotateRight_*():
+ * Rotates a bitfield to the right by "count" bits.
+ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
+ */
+MEM_STATIC
+U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
+    assert(count < 64);
+    count &= 0x3F; /* for fickle pattern recognition */
+    return (value >> count) | (U64)(value << ((0U - count) & 0x3F));
+}
+
+MEM_STATIC
+U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
+    assert(count < 32);
+    count &= 0x1F; /* for fickle pattern recognition */
+    return (value >> count) | (U32)(value << ((0U - count) & 0x1F));
+}
+
+MEM_STATIC
+U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
+    assert(count < 16);
+    count &= 0x0F; /* for fickle pattern recognition */
+    return (value >> count) | (U16)(value << ((0U - count) & 0x0F));
+}
+
+#endif /* ZSTD_BITS_H */
--- a/third_party/zstd/lib/common/bitstream.h
+++ b/third_party/zstd/lib/common/bitstream.h
@ -0,0 +1,458 @@
+// clang-format off
+/* ******************************************************************
+ * bitstream
+ * Part of FSE library
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+#ifndef BITSTREAM_H_MODULE
+#define BITSTREAM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+/*
+*  This API consists of small unitary functions, which must be inlined for best performance.
+*  Since link-time-optimization is not available for all compilers,
+*  these functions are defined into a .h to be included.
+*/
+
+/*-****************************************
+*  Dependencies
+******************************************/
+#include "third_party/zstd/lib/common/mem.h"            /* unaligned access routines */
+#include "third_party/zstd/lib/common/compiler.h"       /* UNLIKELY() */
+#include "third_party/zstd/lib/common/debug.h"          /* assert(), DEBUGLOG(), RAWLOG() */
+#include "third_party/zstd/lib/common/error_private.h"  /* error codes and messages */
+#include "third_party/zstd/lib/common/bits.h"           /* ZSTD_highbit32 */
+
+
+/*=========================================
+*  Target specific
+=========================================*/
+#ifndef ZSTD_NO_INTRINSICS
+#  if (defined(__BMI__) || defined(__BMI2__)) && defined(__GNUC__)
+#include "third_party/intel/immintrin.internal.h"   /* support for bextr (experimental)/bzhi */
+#  elif defined(__ICCARM__)
+// MISSING #include <intrinsics.h>
+#  endif
+#endif
+
+#define STREAM_ACCUMULATOR_MIN_32  25
+#define STREAM_ACCUMULATOR_MIN_64  57
+#define STREAM_ACCUMULATOR_MIN    ((U32)(MEM_32bits() ? STREAM_ACCUMULATOR_MIN_32 : STREAM_ACCUMULATOR_MIN_64))
+
+
+/*-******************************************
+*  bitStream encoding API (write forward)
+********************************************/
+/* bitStream can mix input from multiple sources.
+ * A critical property of these streams is that they encode and decode in **reverse** direction.
+ * So the first bit sequence you add will be the last to be read, like a LIFO stack.
+ */
+typedef struct {
+    size_t bitContainer;
+    unsigned bitPos;
+    char*  startPtr;
+    char*  ptr;
+    char*  endPtr;
+} BIT_CStream_t;
+
+MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* dstBuffer, size_t dstCapacity);
+MEM_STATIC void   BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
+MEM_STATIC void   BIT_flushBits(BIT_CStream_t* bitC);
+MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC);
+
+/* Start with initCStream, providing the size of buffer to write into.
+*  bitStream will never write outside of this buffer.
+*  `dstCapacity` must be >= sizeof(bitD->bitContainer), otherwise @return will be an error code.
+*
+*  bits are first added to a local register.
+*  Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems.
+*  Writing data into memory is an explicit operation, performed by the flushBits function.
+*  Hence keep track how many bits are potentially stored into local register to avoid register overflow.
+*  After a flushBits, a maximum of 7 bits might still be stored into local register.
+*
+*  Avoid storing elements of more than 24 bits if you want compatibility with 32-bits bitstream readers.
+*
+*  Last operation is to close the bitStream.
+*  The function returns the final size of CStream in bytes.
+*  If data couldn't fit into `dstBuffer`, it will return a 0 ( == not storable)
+*/
+
+
+/*-********************************************
+*  bitStream decoding API (read backward)
+**********************************************/
+typedef size_t BitContainerType;
+typedef struct {
+    BitContainerType bitContainer;
+    unsigned bitsConsumed;
+    const char* ptr;
+    const char* start;
+    const char* limitPtr;
+} BIT_DStream_t;
+
+typedef enum { BIT_DStream_unfinished = 0,  /* fully refilled */
+               BIT_DStream_endOfBuffer = 1, /* still some bits left in bitstream */
+               BIT_DStream_completed = 2,   /* bitstream entirely consumed, bit-exact */
+               BIT_DStream_overflow = 3     /* user requested more bits than present in bitstream */
+    } BIT_DStream_status;  /* result of BIT_reloadDStream() */
+
+MEM_STATIC size_t   BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
+MEM_STATIC size_t   BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits);
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD);
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD);
+
+
+/* Start by invoking BIT_initDStream().
+*  A chunk of the bitStream is then stored into a local register.
+*  Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (BitContainerType).
+*  You can then retrieve bitFields stored into the local register, **in reverse order**.
+*  Local register is explicitly reloaded from memory by the BIT_reloadDStream() method.
+*  A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished.
+*  Otherwise, it can be less than that, so proceed accordingly.
+*  Checking if DStream has reached its end can be performed with BIT_endOfDStream().
+*/
+
+
+/*-****************************************
+*  unsafe API
+******************************************/
+MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
+/* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */
+
+MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC);
+/* unsafe version; does not check buffer overflow */
+
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
+/* faster, but works only if nbBits >= 1 */
+
+/*=====    Local Constants   =====*/
+static const unsigned BIT_mask[] = {
+    0,          1,         3,         7,         0xF,       0x1F,
+    0x3F,       0x7F,      0xFF,      0x1FF,     0x3FF,     0x7FF,
+    0xFFF,      0x1FFF,    0x3FFF,    0x7FFF,    0xFFFF,    0x1FFFF,
+    0x3FFFF,    0x7FFFF,   0xFFFFF,   0x1FFFFF,  0x3FFFFF,  0x7FFFFF,
+    0xFFFFFF,   0x1FFFFFF, 0x3FFFFFF, 0x7FFFFFF, 0xFFFFFFF, 0x1FFFFFFF,
+    0x3FFFFFFF, 0x7FFFFFFF}; /* up to 31 bits */
+#define BIT_MASK_SIZE (sizeof(BIT_mask) / sizeof(BIT_mask[0]))
+
+/*-**************************************************************
+*  bitStream encoding
+****************************************************************/
+/*! BIT_initCStream() :
+ *  `dstCapacity` must be > sizeof(size_t)
+ *  @return : 0 if success,
+ *            otherwise an error code (can be tested using ERR_isError()) */
+MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC,
+                                  void* startPtr, size_t dstCapacity)
+{
+    bitC->bitContainer = 0;
+    bitC->bitPos = 0;
+    bitC->startPtr = (char*)startPtr;
+    bitC->ptr = bitC->startPtr;
+    bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer);
+    if (dstCapacity <= sizeof(bitC->bitContainer)) return ERROR(dstSize_tooSmall);
+    return 0;
+}
+
+FORCE_INLINE_TEMPLATE size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
+{
+#if defined(STATIC_BMI2) && STATIC_BMI2 == 1 && !defined(ZSTD_NO_INTRINSICS)
+    return  _bzhi_u64(bitContainer, nbBits);
+#else
+    assert(nbBits < BIT_MASK_SIZE);
+    return bitContainer & BIT_mask[nbBits];
+#endif
+}
+
+/*! BIT_addBits() :
+ *  can add up to 31 bits into `bitC`.
+ *  Note : does not check for register overflow ! */
+MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC,
+                            size_t value, unsigned nbBits)
+{
+    DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32);
+    assert(nbBits < BIT_MASK_SIZE);
+    assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+    bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos;
+    bitC->bitPos += nbBits;
+}
+
+/*! BIT_addBitsFast() :
+ *  works only if `value` is _clean_,
+ *  meaning all high bits above nbBits are 0 */
+MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC,
+                                size_t value, unsigned nbBits)
+{
+    assert((value>>nbBits) == 0);
+    assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+    bitC->bitContainer |= value << bitC->bitPos;
+    bitC->bitPos += nbBits;
+}
+
+/*! BIT_flushBitsFast() :
+ *  assumption : bitContainer has not overflowed
+ *  unsafe version; does not check buffer overflow */
+MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC)
+{
+    size_t const nbBytes = bitC->bitPos >> 3;
+    assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+    assert(bitC->ptr <= bitC->endPtr);
+    MEM_writeLEST(bitC->ptr, bitC->bitContainer);
+    bitC->ptr += nbBytes;
+    bitC->bitPos &= 7;
+    bitC->bitContainer >>= nbBytes*8;
+}
+
+/*! BIT_flushBits() :
+ *  assumption : bitContainer has not overflowed
+ *  safe version; check for buffer overflow, and prevents it.
+ *  note : does not signal buffer overflow.
+ *  overflow will be revealed later on using BIT_closeCStream() */
+MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC)
+{
+    size_t const nbBytes = bitC->bitPos >> 3;
+    assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+    assert(bitC->ptr <= bitC->endPtr);
+    MEM_writeLEST(bitC->ptr, bitC->bitContainer);
+    bitC->ptr += nbBytes;
+    if (bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr;
+    bitC->bitPos &= 7;
+    bitC->bitContainer >>= nbBytes*8;
+}
+
+/*! BIT_closeCStream() :
+ *  @return : size of CStream, in bytes,
+ *            or 0 if it could not fit into dstBuffer */
+MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC)
+{
+    BIT_addBitsFast(bitC, 1, 1);   /* endMark */
+    BIT_flushBits(bitC);
+    if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
+    return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0);
+}
+
+
+/*-********************************************************
+*  bitStream decoding
+**********************************************************/
+/*! BIT_initDStream() :
+ *  Initialize a BIT_DStream_t.
+ * `bitD` : a pointer to an already allocated BIT_DStream_t structure.
+ * `srcSize` must be the *exact* size of the bitStream, in bytes.
+ * @return : size of stream (== srcSize), or an errorCode if a problem is detected
+ */
+MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{
+    if (srcSize < 1) { ZSTD_memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }
+
+    bitD->start = (const char*)srcBuffer;
+    bitD->limitPtr = bitD->start + sizeof(bitD->bitContainer);
+
+    if (srcSize >=  sizeof(bitD->bitContainer)) {  /* normal case */
+        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer);
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+          bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;  /* ensures bitsConsumed is always set */
+          if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ }
+    } else {
+        bitD->ptr   = bitD->start;
+        bitD->bitContainer = *(const BYTE*)(bitD->start);
+        switch(srcSize)
+        {
+        case 7: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16);
+                ZSTD_FALLTHROUGH;
+
+        case 6: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24);
+                ZSTD_FALLTHROUGH;
+
+        case 5: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32);
+                ZSTD_FALLTHROUGH;
+
+        case 4: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[3]) << 24;
+                ZSTD_FALLTHROUGH;
+
+        case 3: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[2]) << 16;
+                ZSTD_FALLTHROUGH;
+
+        case 2: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[1]) <<  8;
+                ZSTD_FALLTHROUGH;
+
+        default: break;
+        }
+        {   BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+            bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
+            if (lastByte == 0) return ERROR(corruption_detected);  /* endMark not present */
+        }
+        bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8;
+    }
+
+    return srcSize;
+}
+
+FORCE_INLINE_TEMPLATE size_t BIT_getUpperBits(BitContainerType bitContainer, U32 const start)
+{
+    return bitContainer >> start;
+}
+
+FORCE_INLINE_TEMPLATE size_t BIT_getMiddleBits(BitContainerType bitContainer, U32 const start, U32 const nbBits)
+{
+    U32 const regMask = sizeof(bitContainer)*8 - 1;
+    /* if start > regMask, bitstream is corrupted, and result is undefined */
+    assert(nbBits < BIT_MASK_SIZE);
+    /* x86 transform & ((1 << nbBits) - 1) to bzhi instruction, it is better
+     * than accessing memory. When bmi2 instruction is not present, we consider
+     * such cpus old (pre-Haswell, 2013) and their performance is not of that
+     * importance.
+     */
+#if defined(__x86_64__) || defined(_M_X86)
+    return (bitContainer >> (start & regMask)) & ((((U64)1) << nbBits) - 1);
+#else
+    return (bitContainer >> (start & regMask)) & BIT_mask[nbBits];
+#endif
+}
+
+/*! BIT_lookBits() :
+ *  Provides next n bits from local register.
+ *  local register is not modified.
+ *  On 32-bits, maxNbBits==24.
+ *  On 64-bits, maxNbBits==56.
+ * @return : value extracted */
+FORCE_INLINE_TEMPLATE size_t BIT_lookBits(const BIT_DStream_t*  bitD, U32 nbBits)
+{
+    /* arbitrate between double-shift and shift+mask */
+#if 1
+    /* if bitD->bitsConsumed + nbBits > sizeof(bitD->bitContainer)*8,
+     * bitstream is likely corrupted, and result is undefined */
+    return BIT_getMiddleBits(bitD->bitContainer, (sizeof(bitD->bitContainer)*8) - bitD->bitsConsumed - nbBits, nbBits);
+#else
+    /* this code path is slower on my os-x laptop */
+    U32 const regMask = sizeof(bitD->bitContainer)*8 - 1;
+    return ((bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> 1) >> ((regMask-nbBits) & regMask);
+#endif
+}
+
+/*! BIT_lookBitsFast() :
+ *  unsafe version; only works if nbBits >= 1 */
+MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits)
+{
+    U32 const regMask = sizeof(bitD->bitContainer)*8 - 1;
+    assert(nbBits >= 1);
+    return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask);
+}
+
+FORCE_INLINE_TEMPLATE void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
+{
+    bitD->bitsConsumed += nbBits;
+}
+
+/*! BIT_readBits() :
+ *  Read (consume) next n bits from local register and update.
+ *  Pay attention to not read more than nbBits contained into local register.
+ * @return : extracted value. */
+FORCE_INLINE_TEMPLATE size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits)
+{
+    size_t const value = BIT_lookBits(bitD, nbBits);
+    BIT_skipBits(bitD, nbBits);
+    return value;
+}
+
+/*! BIT_readBitsFast() :
+ *  unsafe version; only works if nbBits >= 1 */
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits)
+{
+    size_t const value = BIT_lookBitsFast(bitD, nbBits);
+    assert(nbBits >= 1);
+    BIT_skipBits(bitD, nbBits);
+    return value;
+}
+
+/*! BIT_reloadDStream_internal() :
+ *  Simple variant of BIT_reloadDStream(), with two conditions:
+ *  1. bitstream is valid : bitsConsumed <= sizeof(bitD->bitContainer)*8
+ *  2. look window is valid after shifted down : bitD->ptr >= bitD->start
+ */
+MEM_STATIC BIT_DStream_status BIT_reloadDStream_internal(BIT_DStream_t* bitD)
+{
+    assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8);
+    bitD->ptr -= bitD->bitsConsumed >> 3;
+    assert(bitD->ptr >= bitD->start);
+    bitD->bitsConsumed &= 7;
+    bitD->bitContainer = MEM_readLEST(bitD->ptr);
+    return BIT_DStream_unfinished;
+}
+
+/*! BIT_reloadDStreamFast() :
+ *  Similar to BIT_reloadDStream(), but with two differences:
+ *  1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold!
+ *  2. Returns BIT_DStream_overflow when bitD->ptr < bitD->limitPtr, at this
+ *     point you must use BIT_reloadDStream() to reload.
+ */
+MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD)
+{
+    if (UNLIKELY(bitD->ptr < bitD->limitPtr))
+        return BIT_DStream_overflow;
+    return BIT_reloadDStream_internal(bitD);
+}
+
+/*! BIT_reloadDStream() :
+ *  Refill `bitD` from buffer previously set in BIT_initDStream() .
+ *  This function is safe, it guarantees it will not never beyond src buffer.
+ * @return : status of `BIT_DStream_t` internal register.
+ *           when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */
+FORCE_INLINE_TEMPLATE BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
+{
+    /* note : once in overflow mode, a bitstream remains in this mode until it's reset */
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) {
+        static const BitContainerType zeroFilled = 0;
+        bitD->ptr = (const char*)&zeroFilled; /* aliasing is allowed for char */
+        /* overflow detected, erroneous scenario or end of stream: no update */
+        return BIT_DStream_overflow;
+    }
+
+    assert(bitD->ptr >= bitD->start);
+
+    if (bitD->ptr >= bitD->limitPtr) {
+        return BIT_reloadDStream_internal(bitD);
+    }
+    if (bitD->ptr == bitD->start) {
+        /* reached end of bitStream => no update */
+        if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;
+        return BIT_DStream_completed;
+    }
+    /* start < ptr < limitPtr => cautious update */
+    {   U32 nbBytes = bitD->bitsConsumed >> 3;
+        BIT_DStream_status result = BIT_DStream_unfinished;
+        if (bitD->ptr - nbBytes < bitD->start) {
+            nbBytes = (U32)(bitD->ptr - bitD->start);  /* ptr > start */
+            result = BIT_DStream_endOfBuffer;
+        }
+        bitD->ptr -= nbBytes;
+        bitD->bitsConsumed -= nbBytes*8;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD->bitContainer), otherwise bitD->ptr == bitD->start */
+        return result;
+    }
+}
+
+/*! BIT_endOfDStream() :
+ * @return : 1 if DStream has _exactly_ reached its end (all bits consumed).
+ */
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream)
+{
+    return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8));
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BITSTREAM_H_MODULE */
--- a/third_party/zstd/lib/common/compiler.h
+++ b/third_party/zstd/lib/common/compiler.h
@ -0,0 +1,380 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMPILER_H
+#define ZSTD_COMPILER_H
+
+#include "third_party/zstd/lib/common/portability_macros.h"
+
+/*-*******************************************************
+*  Compiler specifics
+*********************************************************/
+/* force inlining */
+
+#if !defined(ZSTD_NO_INLINE)
+#if (defined(__GNUC__) && !defined(__STRICT_ANSI__)) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#  define INLINE_KEYWORD inline
+#else
+#  define INLINE_KEYWORD
+#endif
+
+#if defined(__GNUC__) || defined(__ICCARM__)
+#  define FORCE_INLINE_ATTR __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#  define FORCE_INLINE_ATTR __forceinline
+#else
+#  define FORCE_INLINE_ATTR
+#endif
+
+#else
+
+#define INLINE_KEYWORD
+#define FORCE_INLINE_ATTR
+
+#endif
+
+/**
+  On MSVC qsort requires that functions passed into it use the __cdecl calling conversion(CC).
+  This explicitly marks such functions as __cdecl so that the code will still compile
+  if a CC other than __cdecl has been made the default.
+*/
+#if  defined(_MSC_VER)
+#  define WIN_CDECL __cdecl
+#else
+#  define WIN_CDECL
+#endif
+
+/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */
+#if defined(__GNUC__)
+#  define UNUSED_ATTR __attribute__((unused))
+#else
+#  define UNUSED_ATTR
+#endif
+
+/**
+ * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant
+ * parameters. They must be inlined for the compiler to eliminate the constant
+ * branches.
+ */
+#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR UNUSED_ATTR
+/**
+ * HINT_INLINE is used to help the compiler generate better code. It is *not*
+ * used for "templates", so it can be tweaked based on the compilers
+ * performance.
+ *
+ * gcc-4.8 and gcc-4.9 have been shown to benefit from leaving off the
+ * always_inline attribute.
+ *
+ * clang up to 5.0.0 (trunk) benefit tremendously from the always_inline
+ * attribute.
+ */
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5
+#  define HINT_INLINE static INLINE_KEYWORD
+#else
+#  define HINT_INLINE FORCE_INLINE_TEMPLATE
+#endif
+
+/* "soft" inline :
+ * The compiler is free to select if it's a good idea to inline or not.
+ * The main objective is to silence compiler warnings
+ * when a defined function in included but not used.
+ *
+ * Note : this macro is prefixed `MEM_` because it used to be provided by `mem.h` unit.
+ * Updating the prefix is probably preferable, but requires a fairly large codemod,
+ * since this name is used everywhere.
+ */
+#ifndef MEM_STATIC  /* already defined in Linux Kernel mem.h */
+#if defined(__GNUC__)
+#  define MEM_STATIC static __inline UNUSED_ATTR
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define MEM_STATIC static inline
+#elif defined(_MSC_VER)
+#  define MEM_STATIC static __inline
+#else
+#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+#endif
+
+/* force no inlining */
+#ifdef _MSC_VER
+#  define FORCE_NOINLINE static __declspec(noinline)
+#else
+#  if defined(__GNUC__) || defined(__ICCARM__)
+#    define FORCE_NOINLINE static __attribute__((__noinline__))
+#  else
+#    define FORCE_NOINLINE static
+#  endif
+#endif
+
+
+/* target attribute */
+#if defined(__GNUC__) || defined(__ICCARM__)
+#  define TARGET_ATTRIBUTE(target) __attribute__((__target__(target)))
+#else
+#  define TARGET_ATTRIBUTE(target)
+#endif
+
+/* Target attribute for BMI2 dynamic dispatch.
+ * Enable lzcnt, bmi, and bmi2.
+ * We test for bmi1 & bmi2. lzcnt is included in bmi1.
+ */
+#define BMI2_TARGET_ATTRIBUTE TARGET_ATTRIBUTE("lzcnt,bmi,bmi2")
+
+/* prefetch
+ * can be disabled, by declaring NO_PREFETCH build macro */
+#if defined(NO_PREFETCH)
+#  define PREFETCH_L1(ptr)  (void)(ptr)  /* disabled */
+#  define PREFETCH_L2(ptr)  (void)(ptr)  /* disabled */
+#else
+#  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) && !defined(_M_ARM64EC)  /* _mm_prefetch() is not defined outside of x86/x64 */
+#include "third_party/intel/mmintrin.internal.h"   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+#    define PREFETCH_L1(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+#    define PREFETCH_L2(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
+#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+#    define PREFETCH_L1(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+#    define PREFETCH_L2(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
+#  elif defined(__aarch64__)
+#    define PREFETCH_L1(ptr)  __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr)))
+#    define PREFETCH_L2(ptr)  __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr)))
+#  else
+#    define PREFETCH_L1(ptr) (void)(ptr)  /* disabled */
+#    define PREFETCH_L2(ptr) (void)(ptr)  /* disabled */
+#  endif
+#endif  /* NO_PREFETCH */
+
+#define CACHELINE_SIZE 64
+
+#define PREFETCH_AREA(p, s)  {            \
+    const char* const _ptr = (const char*)(p);  \
+    size_t const _size = (size_t)(s);     \
+    size_t _pos;                          \
+    for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) {  \
+        PREFETCH_L2(_ptr + _pos);         \
+    }                                     \
+}
+
+/* vectorization
+ * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax,
+ * and some compilers, like Intel ICC and MCST LCC, do not support it at all. */
+#if !defined(__INTEL_COMPILER) && !defined(__clang__) && defined(__GNUC__) && !defined(__LCC__)
+#  if (__GNUC__ == 4 && __GNUC_MINOR__ > 3) || (__GNUC__ >= 5)
+#    define DONT_VECTORIZE __attribute__((optimize("no-tree-vectorize")))
+#  else
+#    define DONT_VECTORIZE _Pragma("GCC optimize(\"no-tree-vectorize\")")
+#  endif
+#else
+#  define DONT_VECTORIZE
+#endif
+
+/* Tell the compiler that a branch is likely or unlikely.
+ * Only use these macros if it causes the compiler to generate better code.
+ * If you can remove a LIKELY/UNLIKELY annotation without speed changes in gcc
+ * and clang, please do.
+ */
+#if defined(__GNUC__)
+#define LIKELY(x) (__builtin_expect((x), 1))
+#define UNLIKELY(x) (__builtin_expect((x), 0))
+#else
+#define LIKELY(x) (x)
+#define UNLIKELY(x) (x)
+#endif
+
+#if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)))
+#  define ZSTD_UNREACHABLE { assert(0), __builtin_unreachable(); }
+#else
+#  define ZSTD_UNREACHABLE { assert(0); }
+#endif
+
+/* disable warnings */
+#ifdef _MSC_VER    /* Visual Studio */
+// MISSING #include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4100)        /* disable: C4100: unreferenced formal parameter */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4204)        /* disable: C4204: non-constant aggregate initializer */
+#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
+#  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
+#endif
+
+/*Like DYNAMIC_BMI2 but for compile time determination of BMI2 support*/
+#ifndef STATIC_BMI2
+#  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86))
+#    ifdef __AVX2__  //MSVC does not have a BMI2 specific flag, but every CPU that supports AVX2 also supports BMI2
+#       define STATIC_BMI2 1
+#    endif
+#  elif defined(__BMI2__) && defined(__x86_64__) && defined(__GNUC__)
+#    define STATIC_BMI2 1
+#  endif
+#endif
+
+#ifndef STATIC_BMI2
+    #define STATIC_BMI2 0
+#endif
+
+/* compile time determination of SIMD support */
+#if !defined(ZSTD_NO_INTRINSICS)
+#  if defined(__SSE2__) || defined(_M_AMD64) || (defined (_M_IX86) && defined(_M_IX86_FP) && (_M_IX86_FP >= 2))
+#    define ZSTD_ARCH_X86_SSE2
+#  endif
+#  if defined(__ARM_NEON) || defined(_M_ARM64)
+#    define ZSTD_ARCH_ARM_NEON
+#  endif
+#
+#  if defined(ZSTD_ARCH_X86_SSE2)
+#include "third_party/intel/emmintrin.internal.h"
+#  elif defined(ZSTD_ARCH_ARM_NEON)
+// MISSING #include <arm_neon.h>
+#  endif
+#endif
+
+/* C-language Attributes are added in C23. */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute)
+# define ZSTD_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
+#else
+# define ZSTD_HAS_C_ATTRIBUTE(x) 0
+#endif
+
+/* Only use C++ attributes in C++. Some compilers report support for C++
+ * attributes when compiling with C.
+ */
+#if defined(__cplusplus) && defined(__has_cpp_attribute)
+# define ZSTD_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+#else
+# define ZSTD_HAS_CPP_ATTRIBUTE(x) 0
+#endif
+
+/* Define ZSTD_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute.
+ * - C23: https://en.cppreference.com/w/c/language/attributes/fallthrough
+ * - CPP17: https://en.cppreference.com/w/cpp/language/attributes/fallthrough
+ * - Else: __attribute__((__fallthrough__))
+ */
+#ifndef ZSTD_FALLTHROUGH
+# if ZSTD_HAS_C_ATTRIBUTE(fallthrough)
+#  define ZSTD_FALLTHROUGH [[fallthrough]]
+# elif ZSTD_HAS_CPP_ATTRIBUTE(fallthrough)
+#  define ZSTD_FALLTHROUGH [[fallthrough]]
+# elif __has_attribute(__fallthrough__)
+/* Leading semicolon is to satisfy gcc-11 with -pedantic. Without the semicolon
+ * gcc complains about: a label can only be part of a statement and a declaration is not a statement.
+ */
+#  define ZSTD_FALLTHROUGH ; __attribute__((__fallthrough__))
+# else
+#  define ZSTD_FALLTHROUGH
+# endif
+#endif
+
+/*-**************************************************************
+*  Alignment check
+*****************************************************************/
+
+/* this test was initially positioned in mem.h,
+ * but this file is removed (or replaced) for linux kernel
+ * so it's now hosted in compiler.h,
+ * which remains valid for both user & kernel spaces.
+ */
+
+#ifndef ZSTD_ALIGNOF
+# if defined(__GNUC__) || defined(_MSC_VER)
+/* covers gcc, clang & MSVC */
+/* note : this section must come first, before C11,
+ * due to a limitation in the kernel source generator */
+#  define ZSTD_ALIGNOF(T) __alignof(T)
+
+# elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
+/* C11 support */
+#include "libc/stdalign.internal.h"
+#  define ZSTD_ALIGNOF(T) alignof(T)
+
+# else
+/* No known support for alignof() - imperfect backup */
+#  define ZSTD_ALIGNOF(T) (sizeof(void*) < sizeof(T) ? sizeof(void*) : sizeof(T))
+
+# endif
+#endif /* ZSTD_ALIGNOF */
+
+/*-**************************************************************
+*  Sanitizer
+*****************************************************************/
+
+/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an
+ * abundance of caution, disable our custom poisoning on mingw. */
+#ifdef __MINGW32__
+#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE
+#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1
+#endif
+#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE
+#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1
+#endif
+#endif
+
+#if ZSTD_MEMORY_SANITIZER && !defined(ZSTD_MSAN_DONT_POISON_WORKSPACE)
+/* Not all platforms that support msan provide sanitizers/msan_interface.h.
+ * We therefore declare the functions we need ourselves, rather than trying to
+ * include the header file... */
+  /* size_t */
+#define ZSTD_DEPS_NEED_STDINT
+#include "third_party/zstd/lib/common/zstd_deps.h"  /* intptr_t */
+
+/* Make memory region fully initialized (without changing its contents). */
+void __msan_unpoison(const volatile void *a, size_t size);
+
+/* Make memory region fully uninitialized (without changing its contents).
+   This is a legacy interface that does not update origin information. Use
+   __msan_allocated_memory() instead. */
+void __msan_poison(const volatile void *a, size_t size);
+
+/* Returns the offset of the first (at least partially) poisoned byte in the
+   memory range, or -1 if the whole range is good. */
+intptr_t __msan_test_shadow(const volatile void *x, size_t size);
+
+/* Print shadow and origin for the memory range to stderr in a human-readable
+   format. */
+void __msan_print_shadow(const volatile void *x, size_t size);
+#endif
+
+#if ZSTD_ADDRESS_SANITIZER && !defined(ZSTD_ASAN_DONT_POISON_WORKSPACE)
+/* Not all platforms that support asan provide sanitizers/asan_interface.h.
+ * We therefore declare the functions we need ourselves, rather than trying to
+ * include the header file... */
+  /* size_t */
+
+/**
+ * Marks a memory region (<c>[addr, addr+size)</c>) as unaddressable.
+ *
+ * This memory must be previously allocated by your program. Instrumented
+ * code is forbidden from accessing addresses in this region until it is
+ * unpoisoned. This function is not guaranteed to poison the entire region -
+ * it could poison only a subregion of <c>[addr, addr+size)</c> due to ASan
+ * alignment restrictions.
+ *
+ * \note This function is not thread-safe because no two threads can poison or
+ * unpoison memory in the same memory region simultaneously.
+ *
+ * \param addr Start of memory region.
+ * \param size Size of memory region. */
+void __asan_poison_memory_region(void const volatile *addr, size_t size);
+
+/**
+ * Marks a memory region (<c>[addr, addr+size)</c>) as addressable.
+ *
+ * This memory must be previously allocated by your program. Accessing
+ * addresses in this region is allowed until this region is poisoned again.
+ * This function could unpoison a super-region of <c>[addr, addr+size)</c> due
+ * to ASan alignment restrictions.
+ *
+ * \note This function is not thread-safe because no two threads can
+ * poison or unpoison memory in the same memory region simultaneously.
+ *
+ * \param addr Start of memory region.
+ * \param size Size of memory region. */
+void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
+#endif
+
+#endif /* ZSTD_COMPILER_H */
--- a/third_party/zstd/lib/common/cpu.h
+++ b/third_party/zstd/lib/common/cpu.h
@ -0,0 +1,214 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMMON_CPU_H
+#define ZSTD_COMMON_CPU_H
+
+/**
+ * Implementation taken from folly/CpuId.h
+ * https://github.com/facebook/folly/blob/master/folly/CpuId.h
+ */
+
+#include "third_party/zstd/lib/common/mem.h"
+
+#ifdef _MSC_VER
+// MISSING #include <intrin.h>
+#endif
+
+typedef struct {
+    U32 f1c;
+    U32 f1d;
+    U32 f7b;
+    U32 f7c;
+} ZSTD_cpuid_t;
+
+MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) {
+    U32 f1c = 0;
+    U32 f1d = 0;
+    U32 f7b = 0;
+    U32 f7c = 0;
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+    int reg[4];
+    __cpuid((int*)reg, 0);
+    {
+        int const n = reg[0];
+        if (n >= 1) {
+            __cpuid((int*)reg, 1);
+            f1c = (U32)reg[2];
+            f1d = (U32)reg[3];
+        }
+        if (n >= 7) {
+            __cpuidex((int*)reg, 7, 0);
+            f7b = (U32)reg[1];
+            f7c = (U32)reg[2];
+        }
+    }
+#elif defined(__i386__) && defined(__PIC__) && !defined(__clang__) && defined(__GNUC__)
+    /* The following block like the normal cpuid branch below, but gcc
+     * reserves ebx for use of its pic register so we must specially
+     * handle the save and restore to avoid clobbering the register
+     */
+    U32 n;
+    __asm__(
+        "pushl %%ebx\n\t"
+        "cpuid\n\t"
+        "popl %%ebx\n\t"
+        : "=a"(n)
+        : "a"(0)
+        : "ecx", "edx");
+    if (n >= 1) {
+      U32 f1a;
+      __asm__(
+          "pushl %%ebx\n\t"
+          "cpuid\n\t"
+          "popl %%ebx\n\t"
+          : "=a"(f1a), "=c"(f1c), "=d"(f1d)
+          : "a"(1));
+    }
+    if (n >= 7) {
+      __asm__(
+          "pushl %%ebx\n\t"
+          "cpuid\n\t"
+          "movl %%ebx, %%eax\n\t"
+          "popl %%ebx"
+          : "=a"(f7b), "=c"(f7c)
+          : "a"(7), "c"(0)
+          : "edx");
+    }
+#elif defined(__x86_64__) || defined(_M_X64) || defined(__i386__)
+    U32 n;
+    __asm__("cpuid" : "=a"(n) : "a"(0) : "ebx", "ecx", "edx");
+    if (n >= 1) {
+      U32 f1a;
+      __asm__("cpuid" : "=a"(f1a), "=c"(f1c), "=d"(f1d) : "a"(1) : "ebx");
+    }
+    if (n >= 7) {
+      U32 f7a;
+      __asm__("cpuid"
+              : "=a"(f7a), "=b"(f7b), "=c"(f7c)
+              : "a"(7), "c"(0)
+              : "edx");
+    }
+#endif
+    {
+        ZSTD_cpuid_t cpuid;
+        cpuid.f1c = f1c;
+        cpuid.f1d = f1d;
+        cpuid.f7b = f7b;
+        cpuid.f7c = f7c;
+        return cpuid;
+    }
+}
+
+#define X(name, r, bit)                                                        \
+  MEM_STATIC int ZSTD_cpuid_##name(ZSTD_cpuid_t const cpuid) {                 \
+    return ((cpuid.r) & (1U << bit)) != 0;                                     \
+  }
+
+/* cpuid(1): Processor Info and Feature Bits. */
+#define C(name, bit) X(name, f1c, bit)
+  C(sse3, 0)
+  C(pclmuldq, 1)
+  C(dtes64, 2)
+  C(monitor, 3)
+  C(dscpl, 4)
+  C(vmx, 5)
+  C(smx, 6)
+  C(eist, 7)
+  C(tm2, 8)
+  C(ssse3, 9)
+  C(cnxtid, 10)
+  C(fma, 12)
+  C(cx16, 13)
+  C(xtpr, 14)
+  C(pdcm, 15)
+  C(pcid, 17)
+  C(dca, 18)
+  C(sse41, 19)
+  C(sse42, 20)
+  C(x2apic, 21)
+  C(movbe, 22)
+  C(popcnt, 23)
+  C(tscdeadline, 24)
+  C(aes, 25)
+  C(xsave, 26)
+  C(osxsave, 27)
+  C(avx, 28)
+  C(f16c, 29)
+  C(rdrand, 30)
+#undef C
+#define D(name, bit) X(name, f1d, bit)
+  D(fpu, 0)
+  D(vme, 1)
+  D(de, 2)
+  D(pse, 3)
+  D(tsc, 4)
+  D(msr, 5)
+  D(pae, 6)
+  D(mce, 7)
+  D(cx8, 8)
+  D(apic, 9)
+  D(sep, 11)
+  D(mtrr, 12)
+  D(pge, 13)
+  D(mca, 14)
+  D(cmov, 15)
+  D(pat, 16)
+  D(pse36, 17)
+  D(psn, 18)
+  D(clfsh, 19)
+  D(ds, 21)
+  D(acpi, 22)
+  D(mmx, 23)
+  D(fxsr, 24)
+  D(sse, 25)
+  D(sse2, 26)
+  D(ss, 27)
+  D(htt, 28)
+  D(tm, 29)
+  D(pbe, 31)
+#undef D
+
+/* cpuid(7): Extended Features. */
+#define B(name, bit) X(name, f7b, bit)
+  B(bmi1, 3)
+  B(hle, 4)
+  B(avx2, 5)
+  B(smep, 7)
+  B(bmi2, 8)
+  B(erms, 9)
+  B(invpcid, 10)
+  B(rtm, 11)
+  B(mpx, 14)
+  B(avx512f, 16)
+  B(avx512dq, 17)
+  B(rdseed, 18)
+  B(adx, 19)
+  B(smap, 20)
+  B(avx512ifma, 21)
+  B(pcommit, 22)
+  B(clflushopt, 23)
+  B(clwb, 24)
+  B(avx512pf, 26)
+  B(avx512er, 27)
+  B(avx512cd, 28)
+  B(sha, 29)
+  B(avx512bw, 30)
+  B(avx512vl, 31)
+#undef B
+#define C(name, bit) X(name, f7c, bit)
+  C(prefetchwt1, 0)
+  C(avx512vbmi, 1)
+#undef C
+
+#undef X
+
+#endif /* ZSTD_COMMON_CPU_H */
--- a/third_party/zstd/lib/common/debug.c
+++ b/third_party/zstd/lib/common/debug.c
@ -0,0 +1,25 @@
+// clang-format off
+/* ******************************************************************
+ * debug
+ * Part of FSE library
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+
+/*
+ * This module only hosts one global variable
+ * which can be used to dynamically influence the verbosity of traces,
+ * such as DEBUGLOG and RAWLOG
+ */
+
+#include "third_party/zstd/lib/common/debug.h"
+
+int g_debuglevel = DEBUGLEVEL;
--- a/third_party/zstd/lib/common/debug.h
+++ b/third_party/zstd/lib/common/debug.h
@ -0,0 +1,108 @@
+// clang-format off
+/* ******************************************************************
+ * debug
+ * Part of FSE library
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+
+/*
+ * The purpose of this header is to enable debug functions.
+ * They regroup assert(), DEBUGLOG() and RAWLOG() for run-time,
+ * and DEBUG_STATIC_ASSERT() for compile-time.
+ *
+ * By default, DEBUGLEVEL==0, which means run-time debug is disabled.
+ *
+ * Level 1 enables assert() only.
+ * Starting level 2, traces can be generated and pushed to stderr.
+ * The higher the level, the more verbose the traces.
+ *
+ * It's possible to dynamically adjust level using variable g_debug_level,
+ * which is only declared if DEBUGLEVEL>=2,
+ * and is a global variable, not multi-thread protected (use with care)
+ */
+
+#ifndef DEBUG_H_12987983217
+#define DEBUG_H_12987983217
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* static assert is triggered at compile time, leaving no runtime artefact.
+ * static assert only works with compile-time constants.
+ * Also, this variant can only be used inside a function. */
+#define DEBUG_STATIC_ASSERT(c) (void)sizeof(char[(c) ? 1 : -1])
+
+
+/* DEBUGLEVEL is expected to be defined externally,
+ * typically through compiler command line.
+ * Value must be a number. */
+#ifndef DEBUGLEVEL
+#  define DEBUGLEVEL 0
+#endif
+
+
+/* recommended values for DEBUGLEVEL :
+ * 0 : release mode, no debug, all run-time checks disabled
+ * 1 : enables assert() only, no display
+ * 2 : reserved, for currently active debug path
+ * 3 : events once per object lifetime (CCtx, CDict, etc.)
+ * 4 : events once per frame
+ * 5 : events once per block
+ * 6 : events once per sequence (verbose)
+ * 7+: events at every position (*very* verbose)
+ *
+ * It's generally inconvenient to output traces > 5.
+ * In which case, it's possible to selectively trigger high verbosity levels
+ * by modifying g_debug_level.
+ */
+
+#if (DEBUGLEVEL>=1)
+#  define ZSTD_DEPS_NEED_ASSERT
+#include "third_party/zstd/lib/common/zstd_deps.h"
+#else
+#  ifndef assert   /* assert may be already defined, due to prior #include "libc/assert.h"
+#    define assert(condition) ((void)0)   /* disable assert (default) */
+#  endif
+#endif
+
+#if (DEBUGLEVEL>=2)
+#  define ZSTD_DEPS_NEED_IO
+#include "third_party/zstd/lib/common/zstd_deps.h"
+extern int g_debuglevel; /* the variable is only declared,
+                            it actually lives in debug.c,
+                            and is shared by the whole process.
+                            It's not thread-safe.
+                            It's useful when enabling very verbose levels
+                            on selective conditions (such as position in src) */
+
+#  define RAWLOG(l, ...) {                                       \
+                if (l<=g_debuglevel) {                           \
+                    ZSTD_DEBUG_PRINT(__VA_ARGS__);               \
+            }   }
+#  define DEBUGLOG(l, ...) {                                     \
+                if (l<=g_debuglevel) {                           \
+                    ZSTD_DEBUG_PRINT(__FILE__ ": " __VA_ARGS__); \
+                    ZSTD_DEBUG_PRINT(" \n");                     \
+            }   }
+#else
+#  define RAWLOG(l, ...)      {}    /* disabled */
+#  define DEBUGLOG(l, ...)    {}    /* disabled */
+#endif
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* DEBUG_H_12987983217 */
--- a/third_party/zstd/lib/common/entropy_common.c
+++ b/third_party/zstd/lib/common/entropy_common.c
@ -0,0 +1,341 @@
+// clang-format off
+/* ******************************************************************
+ * Common functions of New Generation Entropy library
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ *  You can contact the author at :
+ *  - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *  - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* *************************************
+*  Dependencies
+***************************************/
+#include "third_party/zstd/lib/common/mem.h"
+#include "third_party/zstd/lib/common/error_private.h"       /* ERR_*, ERROR */
+#define FSE_STATIC_LINKING_ONLY  /* FSE_MIN_TABLELOG */
+#include "third_party/zstd/lib/common/fse.h"
+#include "third_party/zstd/lib/common/huf.h"
+#include "third_party/zstd/lib/common/bits.h"                /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */
+
+
+/*===   Version   ===*/
+unsigned FSE_versionNumber(void) { return FSE_VERSION_NUMBER; }
+
+
+/*===   Error Management   ===*/
+unsigned FSE_isError(size_t code) { return ERR_isError(code); }
+const char* FSE_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+unsigned HUF_isError(size_t code) { return ERR_isError(code); }
+const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+
+/*-**************************************************************
+*  FSE NCount encoding-decoding
+****************************************************************/
+FORCE_INLINE_TEMPLATE
+size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+                           const void* headerBuffer, size_t hbSize)
+{
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    unsigned const maxSV1 = *maxSVPtr + 1;
+    int previous0 = 0;
+
+    if (hbSize < 8) {
+        /* This function only works when hbSize >= 8 */
+        char buffer[8] = {0};
+        ZSTD_memcpy(buffer, headerBuffer, hbSize);
+        {   size_t const countSize = FSE_readNCount(normalizedCounter, maxSVPtr, tableLogPtr,
+                                                    buffer, sizeof(buffer));
+            if (FSE_isError(countSize)) return countSize;
+            if (countSize > hbSize) return ERROR(corruption_detected);
+            return countSize;
+    }   }
+    assert(hbSize >= 8);
+
+    /* init */
+    ZSTD_memset(normalizedCounter, 0, (*maxSVPtr+1) * sizeof(normalizedCounter[0]));   /* all symbols not present in NCount have a frequency of 0 */
+    bitStream = MEM_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;
+    threshold = 1<<nbBits;
+    nbBits++;
+
+    for (;;) {
+        if (previous0) {
+            /* Count the number of repeats. Each time the
+             * 2-bit repeat code is 0b11 there is another
+             * repeat.
+             * Avoid UB by setting the high bit to 1.
+             */
+            int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1;
+            while (repeats >= 12) {
+                charnum += 3 * 12;
+                if (LIKELY(ip <= iend-7)) {
+                    ip += 3;
+                } else {
+                    bitCount -= (int)(8 * (iend - 7 - ip));
+                    bitCount &= 31;
+                    ip = iend - 4;
+                }
+                bitStream = MEM_readLE32(ip) >> bitCount;
+                repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1;
+            }
+            charnum += 3 * repeats;
+            bitStream >>= 2 * repeats;
+            bitCount += 2 * repeats;
+
+            /* Add the final repeat which isn't 0b11. */
+            assert((bitStream & 3) < 3);
+            charnum += bitStream & 3;
+            bitCount += 2;
+
+            /* This is an error, but break and return an error
+             * at the end, because returning out of a loop makes
+             * it harder for the compiler to optimize.
+             */
+            if (charnum >= maxSV1) break;
+
+            /* We don't need to set the normalized count to 0
+             * because we already memset the whole buffer to 0.
+             */
+
+            if (LIKELY(ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                assert((bitCount >> 3) <= 3); /* For first condition to work */
+                ip += bitCount>>3;
+                bitCount &= 7;
+            } else {
+                bitCount -= (int)(8 * (iend - 4 - ip));
+                bitCount &= 31;
+                ip = iend - 4;
+            }
+            bitStream = MEM_readLE32(ip) >> bitCount;
+        }
+        {
+            int const max = (2*threshold-1) - remaining;
+            int count;
+
+            if ((bitStream & (threshold-1)) < (U32)max) {
+                count = bitStream & (threshold-1);
+                bitCount += nbBits-1;
+            } else {
+                count = bitStream & (2*threshold-1);
+                if (count >= threshold) count -= max;
+                bitCount += nbBits;
+            }
+
+            count--;   /* extra accuracy */
+            /* When it matters (small blocks), this is a
+             * predictable branch, because we don't use -1.
+             */
+            if (count >= 0) {
+                remaining -= count;
+            } else {
+                assert(count == -1);
+                remaining += count;
+            }
+            normalizedCounter[charnum++] = (short)count;
+            previous0 = !count;
+
+            assert(threshold > 1);
+            if (remaining < threshold) {
+                /* This branch can be folded into the
+                 * threshold update condition because we
+                 * know that threshold > 1.
+                 */
+                if (remaining <= 1) break;
+                nbBits = ZSTD_highbit32(remaining) + 1;
+                threshold = 1 << (nbBits - 1);
+            }
+            if (charnum >= maxSV1) break;
+
+            if (LIKELY(ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                ip += bitCount>>3;
+                bitCount &= 7;
+            } else {
+                bitCount -= (int)(8 * (iend - 4 - ip));
+                bitCount &= 31;
+                ip = iend - 4;
+            }
+            bitStream = MEM_readLE32(ip) >> bitCount;
+    }   }
+    if (remaining != 1) return ERROR(corruption_detected);
+    /* Only possible when there are too many zeros. */
+    if (charnum > maxSV1) return ERROR(maxSymbolValue_tooSmall);
+    if (bitCount > 32) return ERROR(corruption_detected);
+    *maxSVPtr = charnum-1;
+
+    ip += (bitCount+7)>>3;
+    return ip-istart;
+}
+
+/* Avoids the FORCE_INLINE of the _body() function. */
+static size_t FSE_readNCount_body_default(
+        short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+        const void* headerBuffer, size_t hbSize)
+{
+    return FSE_readNCount_body(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize);
+}
+
+#if DYNAMIC_BMI2
+BMI2_TARGET_ATTRIBUTE static size_t FSE_readNCount_body_bmi2(
+        short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+        const void* headerBuffer, size_t hbSize)
+{
+    return FSE_readNCount_body(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize);
+}
+#endif
+
+size_t FSE_readNCount_bmi2(
+        short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+        const void* headerBuffer, size_t hbSize, int bmi2)
+{
+#if DYNAMIC_BMI2
+    if (bmi2) {
+        return FSE_readNCount_body_bmi2(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize);
+    }
+#endif
+    (void)bmi2;
+    return FSE_readNCount_body_default(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize);
+}
+
+size_t FSE_readNCount(
+        short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+        const void* headerBuffer, size_t hbSize)
+{
+    return FSE_readNCount_bmi2(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize, /* bmi2 */ 0);
+}
+
+
+/*! HUF_readStats() :
+    Read compact Huffman tree, saved by HUF_writeCTable().
+    `huffWeight` is destination buffer.
+    `rankStats` is assumed to be a table of at least HUF_TABLELOG_MAX U32.
+    @return : size read from `src` , or an error Code .
+    Note : Needed by HUF_readCTable() and HUF_readDTableX?() .
+*/
+size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                     U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize)
+{
+    U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
+    return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0);
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                   U32* nbSymbolsPtr, U32* tableLogPtr,
+                   const void* src, size_t srcSize,
+                   void* workSpace, size_t wkspSize,
+                   int bmi2)
+{
+    U32 weightTotal;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;
+    size_t oSize;
+
+    if (!srcSize) return ERROR(srcSize_wrong);
+    iSize = ip[0];
+    /* ZSTD_memset(huffWeight, 0, hwSize);   *//* is not necessary, even though some analyzer complain ... */
+
+    if (iSize >= 128) {  /* special header */
+        oSize = iSize - 127;
+        iSize = ((oSize+1)/2);
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        if (oSize >= hwSize) return ERROR(corruption_detected);
+        ip += 1;
+        {   U32 n;
+            for (n=0; n<oSize; n+=2) {
+                huffWeight[n]   = ip[n/2] >> 4;
+                huffWeight[n+1] = ip[n/2] & 15;
+    }   }   }
+    else  {   /* header compressed with FSE (normal case) */
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        /* max (hwSize-1) values decoded, as last one is implied */
+        oSize = FSE_decompress_wksp_bmi2(huffWeight, hwSize-1, ip+1, iSize, 6, workSpace, wkspSize, bmi2);
+        if (FSE_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats */
+    ZSTD_memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32));
+    weightTotal = 0;
+    {   U32 n; for (n=0; n<oSize; n++) {
+            if (huffWeight[n] > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
+            rankStats[huffWeight[n]]++;
+            weightTotal += (1 << huffWeight[n]) >> 1;
+    }   }
+    if (weightTotal == 0) return ERROR(corruption_detected);
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    {   U32 const tableLog = ZSTD_highbit32(weightTotal) + 1;
+        if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
+        *tableLogPtr = tableLog;
+        /* determine last weight */
+        {   U32 const total = 1 << tableLog;
+            U32 const rest = total - weightTotal;
+            U32 const verif = 1 << ZSTD_highbit32(rest);
+            U32 const lastWeight = ZSTD_highbit32(rest) + 1;
+            if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
+            huffWeight[oSize] = (BYTE)lastWeight;
+            rankStats[lastWeight]++;
+    }   }
+
+    /* check tree construction validity */
+    if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected);   /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* results */
+    *nbSymbolsPtr = (U32)(oSize+1);
+    return iSize+1;
+}
+
+/* Avoids the FORCE_INLINE of the _body() function. */
+static size_t HUF_readStats_body_default(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                     U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize,
+                     void* workSpace, size_t wkspSize)
+{
+    return HUF_readStats_body(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize, 0);
+}
+
+#if DYNAMIC_BMI2
+static BMI2_TARGET_ATTRIBUTE size_t HUF_readStats_body_bmi2(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                     U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize,
+                     void* workSpace, size_t wkspSize)
+{
+    return HUF_readStats_body(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize, 1);
+}
+#endif
+
+size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                     U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize,
+                     void* workSpace, size_t wkspSize,
+                     int flags)
+{
+#if DYNAMIC_BMI2
+    if (flags & HUF_flags_bmi2) {
+        return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize);
+    }
+#endif
+    (void)flags;
+    return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize);
+}
--- a/third_party/zstd/lib/common/error_private.c
+++ b/third_party/zstd/lib/common/error_private.c
@ -0,0 +1,64 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* The purpose of this file is to have a single list of error strings embedded in binary */
+
+#include "third_party/zstd/lib/common/error_private.h"
+
+const char* ERR_getErrorString(ERR_enum code)
+{
+#ifdef ZSTD_STRIP_ERROR_STRINGS
+    (void)code;
+    return "Error strings stripped";
+#else
+    static const char* const notErrorCode = "Unspecified error code";
+    switch( code )
+    {
+    case PREFIX(no_error): return "No error detected";
+    case PREFIX(GENERIC):  return "Error (generic)";
+    case PREFIX(prefix_unknown): return "Unknown frame descriptor";
+    case PREFIX(version_unsupported): return "Version not supported";
+    case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter";
+    case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding";
+    case PREFIX(corruption_detected): return "Data corruption detected";
+    case PREFIX(checksum_wrong): return "Restored data doesn't match checksum";
+    case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification";
+    case PREFIX(parameter_unsupported): return "Unsupported parameter";
+    case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters";
+    case PREFIX(parameter_outOfBound): return "Parameter is out of bound";
+    case PREFIX(init_missing): return "Context should be init first";
+    case PREFIX(memory_allocation): return "Allocation error : not enough memory";
+    case PREFIX(workSpace_tooSmall): return "workSpace buffer is not large enough";
+    case PREFIX(stage_wrong): return "Operation not authorized at current processing stage";
+    case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported";
+    case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large";
+    case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small";
+    case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected";
+    case PREFIX(dictionary_corrupted): return "Dictionary is corrupted";
+    case PREFIX(dictionary_wrong): return "Dictionary mismatch";
+    case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples";
+    case PREFIX(dstSize_tooSmall): return "Destination buffer is too small";
+    case PREFIX(srcSize_wrong): return "Src size is incorrect";
+    case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer";
+    case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full";
+    case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty";
+        /* following error codes are not stable and may be removed or changed in a future version */
+    case PREFIX(frameIndex_tooLarge): return "Frame index is too large";
+    case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking";
+    case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong";
+    case PREFIX(srcBuffer_wrong): return "Source buffer is wrong";
+    case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code";
+    case PREFIX(externalSequences_invalid): return "External sequences are not valid";
+    case PREFIX(maxCode):
+    default: return notErrorCode;
+    }
+#endif
+}
--- a/third_party/zstd/lib/common/error_private.h
+++ b/third_party/zstd/lib/common/error_private.h
@ -0,0 +1,160 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* Note : this module is expected to remain private, do not expose it */
+
+#ifndef ERROR_H_MODULE
+#define ERROR_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* ****************************************
+*  Dependencies
+******************************************/
+#include "third_party/zstd/zstd_errors.h"  /* enum list */
+#include "third_party/zstd/lib/common/compiler.h"
+#include "third_party/zstd/lib/common/debug.h"
+#include "third_party/zstd/lib/common/zstd_deps.h"       /* size_t */
+
+
+/* ****************************************
+*  Compiler-specific
+******************************************/
+#if defined(__GNUC__)
+#  define ERR_STATIC static __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define ERR_STATIC static inline
+#elif defined(_MSC_VER)
+#  define ERR_STATIC static __inline
+#else
+#  define ERR_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/*-****************************************
+*  Customization (error_public.h)
+******************************************/
+typedef ZSTD_ErrorCode ERR_enum;
+#define PREFIX(name) ZSTD_error_##name
+
+
+/*-****************************************
+*  Error codes handling
+******************************************/
+#undef ERROR   /* already defined on Visual Studio */
+#define ERROR(name) ZSTD_ERROR(name)
+#define ZSTD_ERROR(name) ((size_t)-PREFIX(name))
+
+ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }
+
+ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); }
+
+/* check and forward error code */
+#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e
+#define CHECK_F(f)   { CHECK_V_F(_var_err__, f); }
+
+
+/*-****************************************
+*  Error Strings
+******************************************/
+
+const char* ERR_getErrorString(ERR_enum code);   /* error_private.c */
+
+ERR_STATIC const char* ERR_getErrorName(size_t code)
+{
+    return ERR_getErrorString(ERR_getErrorCode(code));
+}
+
+/**
+ * Ignore: this is an internal helper.
+ *
+ * This is a helper function to help force C99-correctness during compilation.
+ * Under strict compilation modes, variadic macro arguments can't be empty.
+ * However, variadic function arguments can be. Using a function therefore lets
+ * us statically check that at least one (string) argument was passed,
+ * independent of the compilation flags.
+ */
+static INLINE_KEYWORD UNUSED_ATTR
+void _force_has_format_string(const char *format, ...) {
+  (void)format;
+}
+
+/**
+ * Ignore: this is an internal helper.
+ *
+ * We want to force this function invocation to be syntactically correct, but
+ * we don't want to force runtime evaluation of its arguments.
+ */
+#define _FORCE_HAS_FORMAT_STRING(...) \
+  if (0) { \
+    _force_has_format_string(__VA_ARGS__); \
+  }
+
+#define ERR_QUOTE(str) #str
+
+/**
+ * Return the specified error if the condition evaluates to true.
+ *
+ * In debug modes, prints additional information.
+ * In order to do that (particularly, printing the conditional that failed),
+ * this can't just wrap RETURN_ERROR().
+ */
+#define RETURN_ERROR_IF(cond, err, ...) \
+  if (cond) { \
+    RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \
+           __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \
+    _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
+    RAWLOG(3, ": " __VA_ARGS__); \
+    RAWLOG(3, "\n"); \
+    return ERROR(err); \
+  }
+
+/**
+ * Unconditionally return the specified error.
+ *
+ * In debug modes, prints additional information.
+ */
+#define RETURN_ERROR(err, ...) \
+  do { \
+    RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \
+           __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \
+    _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
+    RAWLOG(3, ": " __VA_ARGS__); \
+    RAWLOG(3, "\n"); \
+    return ERROR(err); \
+  } while(0);
+
+/**
+ * If the provided expression evaluates to an error code, returns that error code.
+ *
+ * In debug modes, prints additional information.
+ */
+#define FORWARD_IF_ERROR(err, ...) \
+  do { \
+    size_t const err_code = (err); \
+    if (ERR_isError(err_code)) { \
+      RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \
+             __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \
+      _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
+      RAWLOG(3, ": " __VA_ARGS__); \
+      RAWLOG(3, "\n"); \
+      return err_code; \
+    } \
+  } while(0);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ERROR_H_MODULE */
--- a/third_party/zstd/lib/common/fse.h
+++ b/third_party/zstd/lib/common/fse.h
@ -0,0 +1,640 @@
+// clang-format off
+/* ******************************************************************
+ * FSE : Finite State Entropy codec
+ * Public Prototypes declaration
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef FSE_H
+#define FSE_H
+
+
+/*-*****************************************
+*  Dependencies
+******************************************/
+#include "third_party/zstd/lib/common/zstd_deps.h"    /* size_t, ptrdiff_t */
+
+
+/*-*****************************************
+*  FSE_PUBLIC_API : control library symbols visibility
+******************************************/
+#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4)
+#  define FSE_PUBLIC_API __attribute__ ((visibility ("default")))
+#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1)   /* Visual expected */
+#  define FSE_PUBLIC_API __declspec(dllexport)
+#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1)
+#  define FSE_PUBLIC_API __declspec(dllimport) /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
+#else
+#  define FSE_PUBLIC_API
+#endif
+
+/*------   Version   ------*/
+#define FSE_VERSION_MAJOR    0
+#define FSE_VERSION_MINOR    9
+#define FSE_VERSION_RELEASE  0
+
+#define FSE_LIB_VERSION FSE_VERSION_MAJOR.FSE_VERSION_MINOR.FSE_VERSION_RELEASE
+#define FSE_QUOTE(str) #str
+#define FSE_EXPAND_AND_QUOTE(str) FSE_QUOTE(str)
+#define FSE_VERSION_STRING FSE_EXPAND_AND_QUOTE(FSE_LIB_VERSION)
+
+#define FSE_VERSION_NUMBER  (FSE_VERSION_MAJOR *100*100 + FSE_VERSION_MINOR *100 + FSE_VERSION_RELEASE)
+FSE_PUBLIC_API unsigned FSE_versionNumber(void);   /**< library version number; to be used when checking dll version */
+
+
+/*-*****************************************
+*  Tool functions
+******************************************/
+FSE_PUBLIC_API size_t FSE_compressBound(size_t size);       /* maximum compressed size */
+
+/* Error Management */
+FSE_PUBLIC_API unsigned    FSE_isError(size_t code);        /* tells if a return value is an error code */
+FSE_PUBLIC_API const char* FSE_getErrorName(size_t code);   /* provides error code string (useful for debugging) */
+
+
+/*-*****************************************
+*  FSE detailed API
+******************************************/
+/*!
+FSE_compress() does the following:
+1. count symbol occurrence from source[] into table count[] (see hist.h)
+2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog)
+3. save normalized counters to memory buffer using writeNCount()
+4. build encoding table 'CTable' from normalized counters
+5. encode the data stream using encoding table 'CTable'
+
+FSE_decompress() does the following:
+1. read normalized counters with readNCount()
+2. build decoding table 'DTable' from normalized counters
+3. decode the data stream using decoding table 'DTable'
+
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and provide normalized distribution using external method.
+*/
+
+/* *** COMPRESSION *** */
+
+/*! FSE_optimalTableLog():
+    dynamically downsize 'tableLog' when conditions are met.
+    It saves CPU time, by using smaller tables, while preserving or even improving compression ratio.
+    @return : recommended tableLog (necessarily <= 'maxTableLog') */
+FSE_PUBLIC_API unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
+
+/*! FSE_normalizeCount():
+    normalize counts so that sum(count[]) == Power_of_2 (2^tableLog)
+    'normalizedCounter' is a table of short, of minimum size (maxSymbolValue+1).
+    useLowProbCount is a boolean parameter which trades off compressed size for
+    faster header decoding. When it is set to 1, the compressed data will be slightly
+    smaller. And when it is set to 0, FSE_readNCount() and FSE_buildDTable() will be
+    faster. If you are compressing a small amount of data (< 2 KB) then useLowProbCount=0
+    is a good default, since header deserialization makes a big speed difference.
+    Otherwise, useLowProbCount=1 is a good default, since the speed difference is small.
+    @return : tableLog,
+              or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog,
+                    const unsigned* count, size_t srcSize, unsigned maxSymbolValue, unsigned useLowProbCount);
+
+/*! FSE_NCountWriteBound():
+    Provides the maximum possible size of an FSE normalized table, given 'maxSymbolValue' and 'tableLog'.
+    Typically useful for allocation purpose. */
+FSE_PUBLIC_API size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSE_writeNCount():
+    Compactly save 'normalizedCounter' into 'buffer'.
+    @return : size of the compressed table,
+              or an errorCode, which can be tested using FSE_isError(). */
+FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize,
+                                 const short* normalizedCounter,
+                                 unsigned maxSymbolValue, unsigned tableLog);
+
+/*! Constructor and Destructor of FSE_CTable.
+    Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */
+typedef unsigned FSE_CTable;   /* don't allocate that. It's only meant to be more restrictive than void* */
+
+/*! FSE_buildCTable():
+    Builds `ct`, which must be already allocated, using FSE_createCTable().
+    @return : 0, or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSE_compress_usingCTable():
+    Compress `src` using `ct` into `dst` which must be already allocated.
+    @return : size of compressed data (<= `dstCapacity`),
+              or 0 if compressed data could not fit into `dst`,
+              or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_compress_usingCTable (void* dst, size_t dstCapacity, const void* src, size_t srcSize, const FSE_CTable* ct);
+
+/*!
+Tutorial :
+----------
+The first step is to count all symbols. FSE_count() does this job very fast.
+Result will be saved into 'count', a table of unsigned int, which must be already allocated, and have 'maxSymbolValuePtr[0]+1' cells.
+'src' is a table of bytes of size 'srcSize'. All values within 'src' MUST be <= maxSymbolValuePtr[0]
+maxSymbolValuePtr[0] will be updated, with its real value (necessarily <= original value)
+FSE_count() will return the number of occurrence of the most frequent symbol.
+This can be used to know if there is a single symbol within 'src', and to quickly evaluate its compressibility.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
+
+The next step is to normalize the frequencies.
+FSE_normalizeCount() will ensure that sum of frequencies is == 2 ^'tableLog'.
+It also guarantees a minimum of 1 to any Symbol with frequency >= 1.
+You can use 'tableLog'==0 to mean "use default tableLog value".
+If you are unsure of which tableLog value to use, you can ask FSE_optimalTableLog(),
+which will provide the optimal valid tableLog given sourceSize, maxSymbolValue, and a user-defined maximum (0 means "default").
+
+The result of FSE_normalizeCount() will be saved into a table,
+called 'normalizedCounter', which is a table of signed short.
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValue+1' cells.
+The return value is tableLog if everything proceeded as expected.
+It is 0 if there is a single symbol within distribution.
+If there is an error (ex: invalid tableLog value), the function will return an ErrorCode (which can be tested using FSE_isError()).
+
+'normalizedCounter' can be saved in a compact manner to a memory area using FSE_writeNCount().
+'buffer' must be already allocated.
+For guaranteed success, buffer size must be at least FSE_headerBound().
+The result of the function is the number of bytes written into 'buffer'.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError(); ex : buffer size too small).
+
+'normalizedCounter' can then be used to create the compression table 'CTable'.
+The space required by 'CTable' must be already allocated, using FSE_createCTable().
+You can then use FSE_buildCTable() to fill 'CTable'.
+If there is an error, both functions will return an ErrorCode (which can be tested using FSE_isError()).
+
+'CTable' can then be used to compress 'src', with FSE_compress_usingCTable().
+Similar to FSE_count(), the convention is that 'src' is assumed to be a table of char of size 'srcSize'
+The function returns the size of compressed data (without header), necessarily <= `dstCapacity`.
+If it returns '0', compressed data could not fit into 'dst'.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
+*/
+
+
+/* *** DECOMPRESSION *** */
+
+/*! FSE_readNCount():
+    Read compactly saved 'normalizedCounter' from 'rBuffer'.
+    @return : size read from 'rBuffer',
+              or an errorCode, which can be tested using FSE_isError().
+              maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */
+FSE_PUBLIC_API size_t FSE_readNCount (short* normalizedCounter,
+                           unsigned* maxSymbolValuePtr, unsigned* tableLogPtr,
+                           const void* rBuffer, size_t rBuffSize);
+
+/*! FSE_readNCount_bmi2():
+ * Same as FSE_readNCount() but pass bmi2=1 when your CPU supports BMI2 and 0 otherwise.
+ */
+FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter,
+                           unsigned* maxSymbolValuePtr, unsigned* tableLogPtr,
+                           const void* rBuffer, size_t rBuffSize, int bmi2);
+
+typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+
+/*!
+Tutorial :
+----------
+(Note : these functions only decompress FSE-compressed blocks.
+ If block is uncompressed, use memcpy() instead
+ If block is a single repeated byte, use memset() instead )
+
+The first step is to obtain the normalized frequencies of symbols.
+This can be performed by FSE_readNCount() if it was saved using FSE_writeNCount().
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short.
+In practice, that means it's necessary to know 'maxSymbolValue' beforehand,
+or size the table to handle worst case situations (typically 256).
+FSE_readNCount() will provide 'tableLog' and 'maxSymbolValue'.
+The result of FSE_readNCount() is the number of bytes read from 'rBuffer'.
+Note that 'rBufferSize' must be at least 4 bytes, even if useful information is less than that.
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+The next step is to build the decompression tables 'FSE_DTable' from 'normalizedCounter'.
+This is performed by the function FSE_buildDTable().
+The space required by 'FSE_DTable' must be already allocated using FSE_createDTable().
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+`FSE_DTable` can then be used to decompress `cSrc`, with FSE_decompress_usingDTable().
+`cSrcSize` must be strictly correct, otherwise decompression will fail.
+FSE_decompress_usingDTable() result will tell how many bytes were regenerated (<=`dstCapacity`).
+If there is an error, the function will return an error code, which can be tested using FSE_isError(). (ex: dst buffer too small)
+*/
+
+#endif  /* FSE_H */
+
+#if defined(FSE_STATIC_LINKING_ONLY) && !defined(FSE_H_FSE_STATIC_LINKING_ONLY)
+#define FSE_H_FSE_STATIC_LINKING_ONLY
+
+/* *** Dependency *** */
+#include "third_party/zstd/lib/common/bitstream.h"
+
+
+/* *****************************************
+*  Static allocation
+*******************************************/
+/* FSE buffer bounds */
+#define FSE_NCOUNTBOUND 512
+#define FSE_BLOCKBOUND(size) ((size) + ((size)>>7) + 4 /* fse states */ + sizeof(size_t) /* bitContainer */)
+#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* It is possible to statically allocate FSE CTable/DTable as a table of FSE_CTable/FSE_DTable using below macros */
+#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue)   (1 + (1<<((maxTableLog)-1)) + (((maxSymbolValue)+1)*2))
+#define FSE_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<(maxTableLog)))
+
+/* or use the size to malloc() space directly. Pay attention to alignment restrictions though */
+#define FSE_CTABLE_SIZE(maxTableLog, maxSymbolValue)   (FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(FSE_CTable))
+#define FSE_DTABLE_SIZE(maxTableLog)                   (FSE_DTABLE_SIZE_U32(maxTableLog) * sizeof(FSE_DTable))
+
+
+/* *****************************************
+ *  FSE advanced API
+ ***************************************** */
+
+unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus);
+/**< same as FSE_optimalTableLog(), which used `minus==2` */
+
+size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue);
+/**< build a fake FSE_CTable, designed to compress always the same symbolValue */
+
+/* FSE_buildCTable_wksp() :
+ * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
+ * `wkspSize` must be >= `FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog)` of `unsigned`.
+ * See FSE_buildCTable_wksp() for breakdown of workspace usage.
+ */
+#define FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog) (((maxSymbolValue + 2) + (1ull << (tableLog)))/2 + sizeof(U64)/sizeof(U32) /* additional 8 bytes for potential table overwrite */)
+#define FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) (sizeof(unsigned) * FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog))
+size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
+
+#define FSE_BUILD_DTABLE_WKSP_SIZE(maxTableLog, maxSymbolValue) (sizeof(short) * (maxSymbolValue + 1) + (1ULL << maxTableLog) + 8)
+#define FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ((FSE_BUILD_DTABLE_WKSP_SIZE(maxTableLog, maxSymbolValue) + sizeof(unsigned) - 1) / sizeof(unsigned))
+FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
+/**< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */
+
+#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1)
+#define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned))
+size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2);
+/**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`.
+ * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */
+
+typedef enum {
+   FSE_repeat_none,  /**< Cannot use the previous table */
+   FSE_repeat_check, /**< Can use the previous table but it must be checked */
+   FSE_repeat_valid  /**< Can use the previous table and it is assumed to be valid */
+ } FSE_repeat;
+
+/* *****************************************
+*  FSE symbol compression API
+*******************************************/
+/*!
+   This API consists of small unitary functions, which highly benefit from being inlined.
+   Hence their body are included in next section.
+*/
+typedef struct {
+    ptrdiff_t   value;
+    const void* stateTable;
+    const void* symbolTT;
+    unsigned    stateLog;
+} FSE_CState_t;
+
+static void FSE_initCState(FSE_CState_t* CStatePtr, const FSE_CTable* ct);
+
+static void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* CStatePtr, unsigned symbol);
+
+static void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* CStatePtr);
+
+/**<
+These functions are inner components of FSE_compress_usingCTable().
+They allow the creation of custom streams, mixing multiple tables and bit sources.
+
+A key property to keep in mind is that encoding and decoding are done **in reverse direction**.
+So the first symbol you will encode is the last you will decode, like a LIFO stack.
+
+You will need a few variables to track your CStream. They are :
+
+FSE_CTable    ct;         // Provided by FSE_buildCTable()
+BIT_CStream_t bitStream;  // bitStream tracking structure
+FSE_CState_t  state;      // State tracking structure (can have several)
+
+
+The first thing to do is to init bitStream and state.
+    size_t errorCode = BIT_initCStream(&bitStream, dstBuffer, maxDstSize);
+    FSE_initCState(&state, ct);
+
+Note that BIT_initCStream() can produce an error code, so its result should be tested, using FSE_isError();
+You can then encode your input data, byte after byte.
+FSE_encodeSymbol() outputs a maximum of 'tableLog' bits at a time.
+Remember decoding will be done in reverse direction.
+    FSE_encodeByte(&bitStream, &state, symbol);
+
+At any time, you can also add any bit sequence.
+Note : maximum allowed nbBits is 25, for compatibility with 32-bits decoders
+    BIT_addBits(&bitStream, bitField, nbBits);
+
+The above methods don't commit data to memory, they just store it into local register, for speed.
+Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
+Writing data to memory is a manual operation, performed by the flushBits function.
+    BIT_flushBits(&bitStream);
+
+Your last FSE encoding operation shall be to flush your last state value(s).
+    FSE_flushState(&bitStream, &state);
+
+Finally, you must close the bitStream.
+The function returns the size of CStream in bytes.
+If data couldn't fit into dstBuffer, it will return a 0 ( == not compressible)
+If there is an error, it returns an errorCode (which can be tested using FSE_isError()).
+    size_t size = BIT_closeCStream(&bitStream);
+*/
+
+
+/* *****************************************
+*  FSE symbol decompression API
+*******************************************/
+typedef struct {
+    size_t      state;
+    const void* table;   /* precise table may vary, depending on U16 */
+} FSE_DState_t;
+
+
+static void     FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt);
+
+static unsigned char FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+
+static unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr);
+
+/**<
+Let's now decompose FSE_decompress_usingDTable() into its unitary components.
+You will decode FSE-encoded symbols from the bitStream,
+and also any other bitFields you put in, **in reverse order**.
+
+You will need a few variables to track your bitStream. They are :
+
+BIT_DStream_t DStream;    // Stream context
+FSE_DState_t  DState;     // State context. Multiple ones are possible
+FSE_DTable*   DTablePtr;  // Decoding table, provided by FSE_buildDTable()
+
+The first thing to do is to init the bitStream.
+    errorCode = BIT_initDStream(&DStream, srcBuffer, srcSize);
+
+You should then retrieve your initial state(s)
+(in reverse flushing order if you have several ones) :
+    errorCode = FSE_initDState(&DState, &DStream, DTablePtr);
+
+You can then decode your data, symbol after symbol.
+For information the maximum number of bits read by FSE_decodeSymbol() is 'tableLog'.
+Keep in mind that symbols are decoded in reverse order, like a LIFO stack (last in, first out).
+    unsigned char symbol = FSE_decodeSymbol(&DState, &DStream);
+
+You can retrieve any bitfield you eventually stored into the bitStream (in reverse order)
+Note : maximum allowed nbBits is 25, for 32-bits compatibility
+    size_t bitField = BIT_readBits(&DStream, nbBits);
+
+All above operations only read from local register (which size depends on size_t).
+Refueling the register from memory is manually performed by the reload method.
+    endSignal = FSE_reloadDStream(&DStream);
+
+BIT_reloadDStream() result tells if there is still some more data to read from DStream.
+BIT_DStream_unfinished : there is still some data left into the DStream.
+BIT_DStream_endOfBuffer : Dstream reached end of buffer. Its container may no longer be completely filled.
+BIT_DStream_completed : Dstream reached its exact end, corresponding in general to decompression completed.
+BIT_DStream_tooFar : Dstream went too far. Decompression result is corrupted.
+
+When reaching end of buffer (BIT_DStream_endOfBuffer), progress slowly, notably if you decode multiple symbols per loop,
+to properly detect the exact end of stream.
+After each decoded symbol, check if DStream is fully consumed using this simple test :
+    BIT_reloadDStream(&DStream) >= BIT_DStream_completed
+
+When it's done, verify decompression is fully completed, by checking both DStream and the relevant states.
+Checking if DStream has reached its end is performed by :
+    BIT_endOfDStream(&DStream);
+Check also the states. There might be some symbols left there, if some high probability ones (>50%) are possible.
+    FSE_endOfDState(&DState);
+*/
+
+
+/* *****************************************
+*  FSE unsafe API
+*******************************************/
+static unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+
+/* *****************************************
+*  Implementation of inlined functions
+*******************************************/
+typedef struct {
+    int deltaFindState;
+    U32 deltaNbBits;
+} FSE_symbolCompressionTransform; /* total 8 bytes */
+
+MEM_STATIC void FSE_initCState(FSE_CState_t* statePtr, const FSE_CTable* ct)
+{
+    const void* ptr = ct;
+    const U16* u16ptr = (const U16*) ptr;
+    const U32 tableLog = MEM_read16(ptr);
+    statePtr->value = (ptrdiff_t)1<<tableLog;
+    statePtr->stateTable = u16ptr+2;
+    statePtr->symbolTT = ct + 1 + (tableLog ? (1<<(tableLog-1)) : 1);
+    statePtr->stateLog = tableLog;
+}
+
+
+/*! FSE_initCState2() :
+*   Same as FSE_initCState(), but the first symbol to include (which will be the last to be read)
+*   uses the smallest state value possible, saving the cost of this symbol */
+MEM_STATIC void FSE_initCState2(FSE_CState_t* statePtr, const FSE_CTable* ct, U32 symbol)
+{
+    FSE_initCState(statePtr, ct);
+    {   const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+        const U16* stateTable = (const U16*)(statePtr->stateTable);
+        U32 nbBitsOut  = (U32)((symbolTT.deltaNbBits + (1<<15)) >> 16);
+        statePtr->value = (nbBitsOut << 16) - symbolTT.deltaNbBits;
+        statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+    }
+}
+
+MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, unsigned symbol)
+{
+    FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+    const U16* const stateTable = (const U16*)(statePtr->stateTable);
+    U32 const nbBitsOut  = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16);
+    BIT_addBits(bitC, statePtr->value, nbBitsOut);
+    statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+}
+
+MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr)
+{
+    BIT_addBits(bitC, statePtr->value, statePtr->stateLog);
+    BIT_flushBits(bitC);
+}
+
+
+/* FSE_getMaxNbBits() :
+ * Approximate maximum cost of a symbol, in bits.
+ * Fractional get rounded up (i.e. a symbol with a normalized frequency of 3 gives the same result as a frequency of 2)
+ * note 1 : assume symbolValue is valid (<= maxSymbolValue)
+ * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */
+MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue)
+{
+    const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*) symbolTTPtr;
+    return (symbolTT[symbolValue].deltaNbBits + ((1<<16)-1)) >> 16;
+}
+
+/* FSE_bitCost() :
+ * Approximate symbol cost, as fractional value, using fixed-point format (accuracyLog fractional bits)
+ * note 1 : assume symbolValue is valid (<= maxSymbolValue)
+ * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */
+MEM_STATIC U32 FSE_bitCost(const void* symbolTTPtr, U32 tableLog, U32 symbolValue, U32 accuracyLog)
+{
+    const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*) symbolTTPtr;
+    U32 const minNbBits = symbolTT[symbolValue].deltaNbBits >> 16;
+    U32 const threshold = (minNbBits+1) << 16;
+    assert(tableLog < 16);
+    assert(accuracyLog < 31-tableLog);  /* ensure enough room for renormalization double shift */
+    {   U32 const tableSize = 1 << tableLog;
+        U32 const deltaFromThreshold = threshold - (symbolTT[symbolValue].deltaNbBits + tableSize);
+        U32 const normalizedDeltaFromThreshold = (deltaFromThreshold << accuracyLog) >> tableLog;   /* linear interpolation (very approximate) */
+        U32 const bitMultiplier = 1 << accuracyLog;
+        assert(symbolTT[symbolValue].deltaNbBits + tableSize <= threshold);
+        assert(normalizedDeltaFromThreshold <= bitMultiplier);
+        return (minNbBits+1)*bitMultiplier - normalizedDeltaFromThreshold;
+    }
+}
+
+
+/* ======    Decompression    ====== */
+
+typedef struct {
+    U16 tableLog;
+    U16 fastMode;
+} FSE_DTableHeader;   /* sizeof U32 */
+
+typedef struct
+{
+    unsigned short newState;
+    unsigned char  symbol;
+    unsigned char  nbBits;
+} FSE_decode_t;   /* size == U32 */
+
+MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt)
+{
+    const void* ptr = dt;
+    const FSE_DTableHeader* const DTableH = (const FSE_DTableHeader*)ptr;
+    DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog);
+    BIT_reloadDStream(bitD);
+    DStatePtr->table = dt + 1;
+}
+
+MEM_STATIC BYTE FSE_peekSymbol(const FSE_DState_t* DStatePtr)
+{
+    FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    return DInfo.symbol;
+}
+
+MEM_STATIC void FSE_updateState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    size_t const lowBits = BIT_readBits(bitD, nbBits);
+    DStatePtr->state = DInfo.newState + lowBits;
+}
+
+MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    BYTE const symbol = DInfo.symbol;
+    size_t const lowBits = BIT_readBits(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+/*! FSE_decodeSymbolFast() :
+    unsafe, only works if no symbol has a probability > 50% */
+MEM_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    BYTE const symbol = DInfo.symbol;
+    size_t const lowBits = BIT_readBitsFast(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)
+{
+    return DStatePtr->state == 0;
+}
+
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/* **************************************************************
+*  Tuning parameters
+****************************************************************/
+/*!MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#ifndef FSE_MAX_MEMORY_USAGE
+#  define FSE_MAX_MEMORY_USAGE 14
+#endif
+#ifndef FSE_DEFAULT_MEMORY_USAGE
+#  define FSE_DEFAULT_MEMORY_USAGE 13
+#endif
+#if (FSE_DEFAULT_MEMORY_USAGE > FSE_MAX_MEMORY_USAGE)
+#  error "FSE_DEFAULT_MEMORY_USAGE must be <= FSE_MAX_MEMORY_USAGE"
+#endif
+
+/*!FSE_MAX_SYMBOL_VALUE :
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation */
+#ifndef FSE_MAX_SYMBOL_VALUE
+#  define FSE_MAX_SYMBOL_VALUE 255
+#endif
+
+/* **************************************************************
+*  template functions type & suffix
+****************************************************************/
+#define FSE_FUNCTION_TYPE BYTE
+#define FSE_FUNCTION_EXTENSION
+#define FSE_DECODE_TYPE FSE_decode_t
+
+
+#endif   /* !FSE_COMMONDEFS_ONLY */
+
+
+/* ***************************************************************
+*  Constants
+*****************************************************************/
+#define FSE_MAX_TABLELOG  (FSE_MAX_MEMORY_USAGE-2)
+#define FSE_MAX_TABLESIZE (1U<<FSE_MAX_TABLELOG)
+#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE-1)
+#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE-2)
+#define FSE_MIN_TABLELOG 5
+
+#define FSE_TABLELOG_ABSOLUTE_MAX 15
+#if FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX
+#  error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+#define FSE_TABLESTEP(tableSize) (((tableSize)>>1) + ((tableSize)>>3) + 3)
+
+
+#endif /* FSE_STATIC_LINKING_ONLY */
+
+
+#if defined (__cplusplus)
+}
+#endif
--- a/third_party/zstd/lib/common/fse_decompress.c
+++ b/third_party/zstd/lib/common/fse_decompress.c
@ -0,0 +1,312 @@
+// clang-format off
+/* ******************************************************************
+ * FSE : Finite State Entropy decoder
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ *  You can contact the author at :
+ *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *  - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include "third_party/zstd/lib/common/debug.h"      /* assert */
+#include "third_party/zstd/lib/common/bitstream.h"
+#include "third_party/zstd/lib/common/compiler.h"
+#define FSE_STATIC_LINKING_ONLY
+#include "third_party/zstd/lib/common/fse.h"
+#include "third_party/zstd/lib/common/error_private.h"
+#define ZSTD_DEPS_NEED_MALLOC
+#include "third_party/zstd/lib/common/zstd_deps.h"
+#include "third_party/zstd/lib/common/bits.h"       /* ZSTD_highbit32 */
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSE_isError ERR_isError
+#define FSE_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c)   /* use only *after* variable declarations */
+
+
+/* **************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#  error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#  error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
+{
+    void* const tdPtr = dt+1;   /* because *dt is unsigned, 32-bits aligned on 32-bits */
+    FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr);
+    U16* symbolNext = (U16*)workSpace;
+    BYTE* spread = (BYTE*)(symbolNext + maxSymbolValue + 1);
+
+    U32 const maxSV1 = maxSymbolValue + 1;
+    U32 const tableSize = 1 << tableLog;
+    U32 highThreshold = tableSize-1;
+
+    /* Sanity Checks */
+    if (FSE_BUILD_DTABLE_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(maxSymbolValue_tooLarge);
+    if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+
+    /* Init, lay down lowprob symbols */
+    {   FSE_DTableHeader DTableH;
+        DTableH.tableLog = (U16)tableLog;
+        DTableH.fastMode = 1;
+        {   S16 const largeLimit= (S16)(1 << (tableLog-1));
+            U32 s;
+            for (s=0; s<maxSV1; s++) {
+                if (normalizedCounter[s]==-1) {
+                    tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
+                    symbolNext[s] = 1;
+                } else {
+                    if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
+                    symbolNext[s] = normalizedCounter[s];
+        }   }   }
+        ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));
+    }
+
+    /* Spread symbols */
+    if (highThreshold == tableSize - 1) {
+        size_t const tableMask = tableSize-1;
+        size_t const step = FSE_TABLESTEP(tableSize);
+        /* First lay down the symbols in order.
+         * We use a uint64_t to lay down 8 bytes at a time. This reduces branch
+         * misses since small blocks generally have small table logs, so nearly
+         * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
+         * our buffer to handle the over-write.
+         */
+        {
+            U64 const add = 0x0101010101010101ull;
+            size_t pos = 0;
+            U64 sv = 0;
+            U32 s;
+            for (s=0; s<maxSV1; ++s, sv += add) {
+                int i;
+                int const n = normalizedCounter[s];
+                MEM_write64(spread + pos, sv);
+                for (i = 8; i < n; i += 8) {
+                    MEM_write64(spread + pos + i, sv);
+                }
+                pos += n;
+            }
+        }
+        /* Now we spread those positions across the table.
+         * The benefit of doing it in two stages is that we avoid the
+         * variable size inner loop, which caused lots of branch misses.
+         * Now we can run through all the positions without any branch misses.
+         * We unroll the loop twice, since that is what empirically worked best.
+         */
+        {
+            size_t position = 0;
+            size_t s;
+            size_t const unroll = 2;
+            assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
+            for (s = 0; s < (size_t)tableSize; s += unroll) {
+                size_t u;
+                for (u = 0; u < unroll; ++u) {
+                    size_t const uPosition = (position + (u * step)) & tableMask;
+                    tableDecode[uPosition].symbol = spread[s + u];
+                }
+                position = (position + (unroll * step)) & tableMask;
+            }
+            assert(position == 0);
+        }
+    } else {
+        U32 const tableMask = tableSize-1;
+        U32 const step = FSE_TABLESTEP(tableSize);
+        U32 s, position = 0;
+        for (s=0; s<maxSV1; s++) {
+            int i;
+            for (i=0; i<normalizedCounter[s]; i++) {
+                tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
+                position = (position + step) & tableMask;
+                while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+        }   }
+        if (position!=0) return ERROR(GENERIC);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+    }
+
+    /* Build Decoding table */
+    {   U32 u;
+        for (u=0; u<tableSize; u++) {
+            FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol);
+            U32 const nextState = symbolNext[symbol]++;
+            tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
+            tableDecode[u].newState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
+    }   }
+
+    return 0;
+}
+
+size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
+{
+    return FSE_buildDTable_internal(dt, normalizedCounter, maxSymbolValue, tableLog, workSpace, wkspSize);
+}
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/*-*******************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+
+FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSE_DTable* dt, const unsigned fast)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;
+
+    BIT_DStream_t bitD;
+    FSE_DState_t state1;
+    FSE_DState_t state2;
+
+    /* Init */
+    CHECK_F(BIT_initDStream(&bitD, cSrc, cSrcSize));
+
+    FSE_initDState(&state1, &bitD, dt);
+    FSE_initDState(&state2, &bitD, dt);
+
+#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)
+
+    /* 4 symbols per loop */
+    for ( ; (BIT_reloadDStream(&bitD)==BIT_DStream_unfinished) & (op<olimit) ; op+=4) {
+        op[0] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[1] = FSE_GETSYMBOL(&state2);
+
+        if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) { op+=2; break; } }
+
+        op[2] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[3] = FSE_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
+    while (1) {
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+        *op++ = FSE_GETSYMBOL(&state1);
+        if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) {
+            *op++ = FSE_GETSYMBOL(&state2);
+            break;
+        }
+
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+        *op++ = FSE_GETSYMBOL(&state2);
+        if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) {
+            *op++ = FSE_GETSYMBOL(&state1);
+            break;
+    }   }
+
+    return op-ostart;
+}
+
+typedef struct {
+    short ncount[FSE_MAX_SYMBOL_VALUE + 1];
+    FSE_DTable dtable[1]; /* Dynamically sized */
+} FSE_DecompressWksp;
+
+
+FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body(
+        void* dst, size_t dstCapacity,
+        const void* cSrc, size_t cSrcSize,
+        unsigned maxLog, void* workSpace, size_t wkspSize,
+        int bmi2)
+{
+    const BYTE* const istart = (const BYTE*)cSrc;
+    const BYTE* ip = istart;
+    unsigned tableLog;
+    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+    FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace;
+
+    DEBUG_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0);
+    if (wkspSize < sizeof(*wksp)) return ERROR(GENERIC);
+
+    /* normal FSE decoding mode */
+    {
+        size_t const NCountLength = FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2);
+        if (FSE_isError(NCountLength)) return NCountLength;
+        if (tableLog > maxLog) return ERROR(tableLog_tooLarge);
+        assert(NCountLength <= cSrcSize);
+        ip += NCountLength;
+        cSrcSize -= NCountLength;
+    }
+
+    if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge);
+    assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize);
+    workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog);
+    wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog);
+
+    CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) );
+
+    {
+        const void* ptr = wksp->dtable;
+        const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr;
+        const U32 fastMode = DTableH->fastMode;
+
+        /* select fast mode (static) */
+        if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 1);
+        return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 0);
+    }
+}
+
+/* Avoids the FORCE_INLINE of the _body() function. */
+static size_t FSE_decompress_wksp_body_default(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize)
+{
+    return FSE_decompress_wksp_body(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, 0);
+}
+
+#if DYNAMIC_BMI2
+BMI2_TARGET_ATTRIBUTE static size_t FSE_decompress_wksp_body_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize)
+{
+    return FSE_decompress_wksp_body(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, 1);
+}
+#endif
+
+size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2)
+{
+#if DYNAMIC_BMI2
+    if (bmi2) {
+        return FSE_decompress_wksp_body_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize);
+    }
+#endif
+    (void)bmi2;
+    return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize);
+}
+
+#endif   /* FSE_COMMONDEFS_ONLY */
--- a/third_party/zstd/lib/common/huf.h
+++ b/third_party/zstd/lib/common/huf.h
@ -0,0 +1,274 @@
+// clang-format off
+/* ******************************************************************
+ * huff0 huffman codec,
+ * part of Finite State Entropy library
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef HUF_H_298734234
+#define HUF_H_298734234
+
+/* *** Dependencies *** */
+#include "third_party/zstd/lib/common/zstd_deps.h"    /* size_t */
+#include "third_party/zstd/lib/common/mem.h"          /* U32 */
+#define FSE_STATIC_LINKING_ONLY
+#include "third_party/zstd/lib/common/fse.h"
+
+
+/* ***   Tool functions *** */
+#define HUF_BLOCKSIZE_MAX (128 * 1024)   /**< maximum input size for a single block compressed with HUF_compress */
+size_t HUF_compressBound(size_t size);   /**< maximum compressed size (worst case) */
+
+/* Error Management */
+unsigned    HUF_isError(size_t code);       /**< tells if a return value is an error code */
+const char* HUF_getErrorName(size_t code);  /**< provides error code string (useful for debugging) */
+
+
+#define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */)
+#define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64))
+
+/* *** Constants *** */
+#define HUF_TABLELOG_MAX      12      /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */
+#define HUF_TABLELOG_DEFAULT  11      /* default tableLog value when none specified */
+#define HUF_SYMBOLVALUE_MAX  255
+
+#define HUF_TABLELOG_ABSOLUTEMAX  12  /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */
+#if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX)
+#  error "HUF_TABLELOG_MAX is too large !"
+#endif
+
+
+/* ****************************************
+*  Static allocation
+******************************************/
+/* HUF buffer bounds */
+#define HUF_CTABLEBOUND 129
+#define HUF_BLOCKBOUND(size) (size + (size>>8) + 8)   /* only true when incompressible is pre-filtered with fast heuristic */
+#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* static allocation of HUF's Compression Table */
+/* this is a private definition, just exposed for allocation and strict aliasing purpose. never EVER access its members directly */
+typedef size_t HUF_CElt;   /* consider it an incomplete type */
+#define HUF_CTABLE_SIZE_ST(maxSymbolValue)   ((maxSymbolValue)+2)   /* Use tables of size_t, for proper alignment */
+#define HUF_CTABLE_SIZE(maxSymbolValue)       (HUF_CTABLE_SIZE_ST(maxSymbolValue) * sizeof(size_t))
+#define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \
+    HUF_CElt name[HUF_CTABLE_SIZE_ST(maxSymbolValue)] /* no final ; */
+
+/* static allocation of HUF's DTable */
+typedef U32 HUF_DTable;
+#define HUF_DTABLE_SIZE(maxTableLog)   (1 + (1<<(maxTableLog)))
+#define HUF_CREATE_STATIC_DTABLEX1(DTable, maxTableLog) \
+        HUF_DTable DTable[HUF_DTABLE_SIZE((maxTableLog)-1)] = { ((U32)((maxTableLog)-1) * 0x01000001) }
+#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
+        HUF_DTable DTable[HUF_DTABLE_SIZE(maxTableLog)] = { ((U32)(maxTableLog) * 0x01000001) }
+
+
+/* ****************************************
+*  Advanced decompression functions
+******************************************/
+
+/**
+ * Huffman flags bitset.
+ * For all flags, 0 is the default value.
+ */
+typedef enum {
+    /**
+     * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime.
+     * Otherwise: Ignored.
+     */
+    HUF_flags_bmi2 = (1 << 0),
+    /**
+     * If set: Test possible table depths to find the one that produces the smallest header + encoded size.
+     * If unset: Use heuristic to find the table depth.
+     */
+    HUF_flags_optimalDepth = (1 << 1),
+    /**
+     * If set: If the previous table can encode the input, always reuse the previous table.
+     * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output.
+     */
+    HUF_flags_preferRepeat = (1 << 2),
+    /**
+     * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress.
+     * If unset: Always histogram the entire input.
+     */
+    HUF_flags_suspectUncompressible = (1 << 3),
+    /**
+     * If set: Don't use assembly implementations
+     * If unset: Allow using assembly implementations
+     */
+    HUF_flags_disableAsm = (1 << 4),
+    /**
+     * If set: Don't use the fast decoding loop, always use the fallback decoding loop.
+     * If unset: Use the fast decoding loop when possible.
+     */
+    HUF_flags_disableFast = (1 << 5)
+} HUF_flags_e;
+
+
+/* ****************************************
+ *  HUF detailed API
+ * ****************************************/
+#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra
+
+/*! HUF_compress() does the following:
+ *  1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h")
+ *  2. (optional) refine tableLog using HUF_optimalTableLog()
+ *  3. build Huffman table from count using HUF_buildCTable()
+ *  4. save Huffman table to memory buffer using HUF_writeCTable()
+ *  5. encode the data stream using HUF_compress4X_usingCTable()
+ *
+ *  The following API allows targeting specific sub-functions for advanced tasks.
+ *  For example, it's possible to compress several blocks using the same 'CTable',
+ *  or to save and regenerate 'CTable' using external methods.
+ */
+unsigned HUF_minTableLog(unsigned symbolCardinality);
+unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue);
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace,
+ size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */
+size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize);
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags);
+size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);
+int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);
+
+typedef enum {
+   HUF_repeat_none,  /**< Cannot use the previous table */
+   HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */
+   HUF_repeat_valid  /**< Can use the previous table and it is assumed to be valid */
+ } HUF_repeat;
+
+/** HUF_compress4X_repeat() :
+ *  Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+ *  If it uses hufTable it does not modify hufTable or repeat.
+ *  If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+ *  If preferRepeat then the old table will always be used if valid.
+ *  If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */
+size_t HUF_compress4X_repeat(void* dst, size_t dstSize,
+                       const void* src, size_t srcSize,
+                       unsigned maxSymbolValue, unsigned tableLog,
+                       void* workSpace, size_t wkspSize,    /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
+                       HUF_CElt* hufTable, HUF_repeat* repeat, int flags);
+
+/** HUF_buildCTable_wksp() :
+ *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
+ * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE.
+ */
+#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192)
+#define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned))
+size_t HUF_buildCTable_wksp (HUF_CElt* tree,
+                       const unsigned* count, U32 maxSymbolValue, U32 maxNbBits,
+                             void* workSpace, size_t wkspSize);
+
+/*! HUF_readStats() :
+ *  Read compact Huffman tree, saved by HUF_writeCTable().
+ * `huffWeight` is destination buffer.
+ * @return : size read from `src` , or an error Code .
+ *  Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */
+size_t HUF_readStats(BYTE* huffWeight, size_t hwSize,
+                     U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize);
+
+/*! HUF_readStats_wksp() :
+ * Same as HUF_readStats() but takes an external workspace which must be
+ * 4-byte aligned and its size must be >= HUF_READ_STATS_WORKSPACE_SIZE.
+ * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0.
+ */
+#define HUF_READ_STATS_WORKSPACE_SIZE_U32 FSE_DECOMPRESS_WKSP_SIZE_U32(6, HUF_TABLELOG_MAX-1)
+#define HUF_READ_STATS_WORKSPACE_SIZE (HUF_READ_STATS_WORKSPACE_SIZE_U32 * sizeof(unsigned))
+size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize,
+                          U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr,
+                          const void* src, size_t srcSize,
+                          void* workspace, size_t wkspSize,
+                          int flags);
+
+/** HUF_readCTable() :
+ *  Loading a CTable saved with HUF_writeCTable() */
+size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned *hasZeroWeights);
+
+/** HUF_getNbBitsFromCTable() :
+ *  Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX
+ *  Note 1 : is not inlined, as HUF_CElt definition is private */
+U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue);
+
+/*
+ * HUF_decompress() does the following:
+ * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics
+ * 2. build Huffman table from save, using HUF_readDTableX?()
+ * 3. decode 1 or 4 segments in parallel using HUF_decompress?X?_usingDTable()
+ */
+
+/** HUF_selectDecoder() :
+ *  Tells which decoder is likely to decode faster,
+ *  based on a set of pre-computed metrics.
+ * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 .
+ *  Assumption : 0 < dstSize <= 128 KB */
+U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize);
+
+/**
+ *  The minimum workspace size for the `workSpace` used in
+ *  HUF_readDTableX1_wksp() and HUF_readDTableX2_wksp().
+ *
+ *  The space used depends on HUF_TABLELOG_MAX, ranging from ~1500 bytes when
+ *  HUF_TABLE_LOG_MAX=12 to ~1850 bytes when HUF_TABLE_LOG_MAX=15.
+ *  Buffer overflow errors may potentially occur if code modifications result in
+ *  a required workspace size greater than that specified in the following
+ *  macro.
+ */
+#define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9))
+#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))
+
+
+/* ====================== */
+/* single stream variants */
+/* ====================== */
+
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags);
+/** HUF_compress1X_repeat() :
+ *  Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+ *  If it uses hufTable it does not modify hufTable or repeat.
+ *  If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+ *  If preferRepeat then the old table will always be used if valid.
+ *  If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */
+size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
+                       const void* src, size_t srcSize,
+                       unsigned maxSymbolValue, unsigned tableLog,
+                       void* workSpace, size_t wkspSize,   /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
+                       HUF_CElt* hufTable, HUF_repeat* repeat, int flags);
+
+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);   /**< double-symbols decoder */
+#endif
+
+/* BMI2 variants.
+ * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0.
+ */
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags);
+#ifndef HUF_FORCE_DECOMPRESS_X2
+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
+#endif
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags);
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
+#ifndef HUF_FORCE_DECOMPRESS_X2
+size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags);
+#endif
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags);
+#endif
+
+#endif   /* HUF_H_298734234 */
+
+#if defined (__cplusplus)
+}
+#endif
--- a/third_party/zstd/lib/common/mem.h
+++ b/third_party/zstd/lib/common/mem.h
@ -0,0 +1,452 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-****************************************
+*  Dependencies
+******************************************/
+  /* size_t, ptrdiff_t */
+#include "third_party/zstd/lib/common/compiler.h"  /* __has_builtin */
+#include "third_party/zstd/lib/common/debug.h"  /* DEBUG_STATIC_ASSERT */
+#include "third_party/zstd/lib/common/zstd_deps.h"  /* ZSTD_memcpy */
+
+
+/*-****************************************
+*  Compiler specifics
+******************************************/
+#if defined(_MSC_VER)   /* Visual Studio */
+#include "libc/calls/calls.h"
+#include "libc/calls/termios.h"
+#include "libc/fmt/conv.h"
+#include "libc/limits.h"
+#include "libc/mem/alg.h"
+#include "libc/mem/alloca.h"
+#include "libc/mem/mem.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/rand.h"
+#include "libc/stdio/temp.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/exit.h"
+#include "third_party/getopt/getopt.h"
+#include "third_party/musl/crypt.h"
+#include "third_party/musl/rand48.h"  /* _byteswap_ulong */
+// MISSING #include <intrin.h>  /* _byteswap_* */
+#endif
+
+/*-**************************************************************
+*  Basic Types
+*****************************************************************/
+#if  !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#  if defined(_AIX)
+#include "libc/fmt/conv.h"
+#include "libc/inttypes.h"
+#include "libc/limits.h"
+#include "libc/literal.h"
+#  else
+#include "libc/inttypes.h"
+#include "libc/limits.h"
+#include "libc/literal.h" /* intptr_t */
+#  endif
+  typedef   uint8_t BYTE;
+  typedef   uint8_t U8;
+  typedef    int8_t S8;
+  typedef  uint16_t U16;
+  typedef   int16_t S16;
+  typedef  uint32_t U32;
+  typedef   int32_t S32;
+  typedef  uint64_t U64;
+  typedef   int64_t S64;
+#else
+#include "libc/limits.h"
+#include "libc/sysv/consts/_posix.h"
+#include "libc/sysv/consts/iov.h"
+#include "libc/sysv/consts/limits.h"
+#include "libc/sysv/consts/xopen.h"
+#include "libc/thread/thread.h"
+#if CHAR_BIT != 8
+#  error "this implementation requires char to be exactly 8-bit type"
+#endif
+  typedef unsigned char      BYTE;
+  typedef unsigned char      U8;
+  typedef   signed char      S8;
+#if USHRT_MAX != 65535
+#  error "this implementation requires short to be exactly 16-bit type"
+#endif
+  typedef unsigned short      U16;
+  typedef   signed short      S16;
+#if UINT_MAX != 4294967295
+#  error "this implementation requires int to be exactly 32-bit type"
+#endif
+  typedef unsigned int        U32;
+  typedef   signed int        S32;
+/* note : there are no limits defined for long long type in C90.
+ * limits exist in C99, however, in such case, <stdint.h> is preferred */
+  typedef unsigned long long  U64;
+  typedef   signed long long  S64;
+#endif
+
+
+/*-**************************************************************
+*  Memory I/O API
+*****************************************************************/
+/*=== Static platform detection ===*/
+MEM_STATIC unsigned MEM_32bits(void);
+MEM_STATIC unsigned MEM_64bits(void);
+MEM_STATIC unsigned MEM_isLittleEndian(void);
+
+/*=== Native unaligned read/write ===*/
+MEM_STATIC U16 MEM_read16(const void* memPtr);
+MEM_STATIC U32 MEM_read32(const void* memPtr);
+MEM_STATIC U64 MEM_read64(const void* memPtr);
+MEM_STATIC size_t MEM_readST(const void* memPtr);
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value);
+MEM_STATIC void MEM_write32(void* memPtr, U32 value);
+MEM_STATIC void MEM_write64(void* memPtr, U64 value);
+
+/*=== Little endian unaligned read/write ===*/
+MEM_STATIC U16 MEM_readLE16(const void* memPtr);
+MEM_STATIC U32 MEM_readLE24(const void* memPtr);
+MEM_STATIC U32 MEM_readLE32(const void* memPtr);
+MEM_STATIC U64 MEM_readLE64(const void* memPtr);
+MEM_STATIC size_t MEM_readLEST(const void* memPtr);
+
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val);
+MEM_STATIC void MEM_writeLE24(void* memPtr, U32 val);
+MEM_STATIC void MEM_writeLE32(void* memPtr, U32 val32);
+MEM_STATIC void MEM_writeLE64(void* memPtr, U64 val64);
+MEM_STATIC void MEM_writeLEST(void* memPtr, size_t val);
+
+/*=== Big endian unaligned read/write ===*/
+MEM_STATIC U32 MEM_readBE32(const void* memPtr);
+MEM_STATIC U64 MEM_readBE64(const void* memPtr);
+MEM_STATIC size_t MEM_readBEST(const void* memPtr);
+
+MEM_STATIC void MEM_writeBE32(void* memPtr, U32 val32);
+MEM_STATIC void MEM_writeBE64(void* memPtr, U64 val64);
+MEM_STATIC void MEM_writeBEST(void* memPtr, size_t val);
+
+/*=== Byteswap ===*/
+MEM_STATIC U32 MEM_swap32(U32 in);
+MEM_STATIC U64 MEM_swap64(U64 in);
+MEM_STATIC size_t MEM_swapST(size_t in);
+
+
+/*-**************************************************************
+*  Memory I/O Implementation
+*****************************************************************/
+/* MEM_FORCE_MEMORY_ACCESS : For accessing unaligned memory:
+ * Method 0 : always use `memcpy()`. Safe and portable.
+ * Method 1 : Use compiler extension to set unaligned access.
+ * Method 2 : direct access. This method is portable but violate C standard.
+ *            It can generate buggy code on targets depending on alignment.
+ * Default  : method 1 if supported, else method 0
+ */
+#ifndef MEM_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  ifdef __GNUC__
+#    define MEM_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+MEM_STATIC unsigned MEM_32bits(void) { return sizeof(size_t)==4; }
+MEM_STATIC unsigned MEM_64bits(void) { return sizeof(size_t)==8; }
+
+MEM_STATIC unsigned MEM_isLittleEndian(void)
+{
+#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+    return 1;
+#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return 0;
+#elif defined(__clang__) && __LITTLE_ENDIAN__
+    return 1;
+#elif defined(__clang__) && __BIG_ENDIAN__
+    return 0;
+#elif defined(_MSC_VER) && (_M_AMD64 || _M_IX86)
+    return 1;
+#elif defined(__DMC__) && defined(_M_IX86)
+    return 1;
+#else
+    const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];
+#endif
+}
+
+#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
+
+/* violates C standard, by lying on structure alignment.
+Only use if no other choice to achieve best performance on target platform */
+MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; }
+MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; }
+MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; }
+MEM_STATIC size_t MEM_readST(const void* memPtr) { return *(const size_t*) memPtr; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
+MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; }
+MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; }
+
+#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
+
+typedef __attribute__((aligned(1))) U16 unalign16;
+typedef __attribute__((aligned(1))) U32 unalign32;
+typedef __attribute__((aligned(1))) U64 unalign64;
+typedef __attribute__((aligned(1))) size_t unalignArch;
+
+MEM_STATIC U16 MEM_read16(const void* ptr) { return *(const unalign16*)ptr; }
+MEM_STATIC U32 MEM_read32(const void* ptr) { return *(const unalign32*)ptr; }
+MEM_STATIC U64 MEM_read64(const void* ptr) { return *(const unalign64*)ptr; }
+MEM_STATIC size_t MEM_readST(const void* ptr) { return *(const unalignArch*)ptr; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(unalign16*)memPtr = value; }
+MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(unalign32*)memPtr = value; }
+MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(unalign64*)memPtr = value; }
+
+#else
+
+/* default method, safe and standard.
+   can sometimes prove slower */
+
+MEM_STATIC U16 MEM_read16(const void* memPtr)
+{
+    U16 val; ZSTD_memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U32 MEM_read32(const void* memPtr)
+{
+    U32 val; ZSTD_memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U64 MEM_read64(const void* memPtr)
+{
+    U64 val; ZSTD_memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC size_t MEM_readST(const void* memPtr)
+{
+    size_t val; ZSTD_memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value)
+{
+    ZSTD_memcpy(memPtr, &value, sizeof(value));
+}
+
+MEM_STATIC void MEM_write32(void* memPtr, U32 value)
+{
+    ZSTD_memcpy(memPtr, &value, sizeof(value));
+}
+
+MEM_STATIC void MEM_write64(void* memPtr, U64 value)
+{
+    ZSTD_memcpy(memPtr, &value, sizeof(value));
+}
+
+#endif /* MEM_FORCE_MEMORY_ACCESS */
+
+MEM_STATIC U32 MEM_swap32_fallback(U32 in)
+{
+    return  ((in << 24) & 0xff000000 ) |
+            ((in <<  8) & 0x00ff0000 ) |
+            ((in >>  8) & 0x0000ff00 ) |
+            ((in >> 24) & 0x000000ff );
+}
+
+MEM_STATIC U32 MEM_swap32(U32 in)
+{
+#if defined(_MSC_VER)     /* Visual Studio */
+    return _byteswap_ulong(in);
+#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \
+  || (defined(__clang__) && __has_builtin(__builtin_bswap32))
+    return __builtin_bswap32(in);
+#else
+    return MEM_swap32_fallback(in);
+#endif
+}
+
+MEM_STATIC U64 MEM_swap64_fallback(U64 in)
+{
+     return  ((in << 56) & 0xff00000000000000ULL) |
+            ((in << 40) & 0x00ff000000000000ULL) |
+            ((in << 24) & 0x0000ff0000000000ULL) |
+            ((in << 8)  & 0x000000ff00000000ULL) |
+            ((in >> 8)  & 0x00000000ff000000ULL) |
+            ((in >> 24) & 0x0000000000ff0000ULL) |
+            ((in >> 40) & 0x000000000000ff00ULL) |
+            ((in >> 56) & 0x00000000000000ffULL);
+}
+
+MEM_STATIC U64 MEM_swap64(U64 in)
+{
+#if defined(_MSC_VER)     /* Visual Studio */
+    return _byteswap_uint64(in);
+#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \
+  || (defined(__clang__) && __has_builtin(__builtin_bswap64))
+    return __builtin_bswap64(in);
+#else
+    return MEM_swap64_fallback(in);
+#endif
+}
+
+MEM_STATIC size_t MEM_swapST(size_t in)
+{
+    if (MEM_32bits())
+        return (size_t)MEM_swap32((U32)in);
+    else
+        return (size_t)MEM_swap64((U64)in);
+}
+
+/*=== Little endian r/w ===*/
+
+MEM_STATIC U16 MEM_readLE16(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read16(memPtr);
+    else {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U16)(p[0] + (p[1]<<8));
+    }
+}
+
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val)
+{
+    if (MEM_isLittleEndian()) {
+        MEM_write16(memPtr, val);
+    } else {
+        BYTE* p = (BYTE*)memPtr;
+        p[0] = (BYTE)val;
+        p[1] = (BYTE)(val>>8);
+    }
+}
+
+MEM_STATIC U32 MEM_readLE24(const void* memPtr)
+{
+    return (U32)MEM_readLE16(memPtr) + ((U32)(((const BYTE*)memPtr)[2]) << 16);
+}
+
+MEM_STATIC void MEM_writeLE24(void* memPtr, U32 val)
+{
+    MEM_writeLE16(memPtr, (U16)val);
+    ((BYTE*)memPtr)[2] = (BYTE)(val>>16);
+}
+
+MEM_STATIC U32 MEM_readLE32(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read32(memPtr);
+    else
+        return MEM_swap32(MEM_read32(memPtr));
+}
+
+MEM_STATIC void MEM_writeLE32(void* memPtr, U32 val32)
+{
+    if (MEM_isLittleEndian())
+        MEM_write32(memPtr, val32);
+    else
+        MEM_write32(memPtr, MEM_swap32(val32));
+}
+
+MEM_STATIC U64 MEM_readLE64(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read64(memPtr);
+    else
+        return MEM_swap64(MEM_read64(memPtr));
+}
+
+MEM_STATIC void MEM_writeLE64(void* memPtr, U64 val64)
+{
+    if (MEM_isLittleEndian())
+        MEM_write64(memPtr, val64);
+    else
+        MEM_write64(memPtr, MEM_swap64(val64));
+}
+
+MEM_STATIC size_t MEM_readLEST(const void* memPtr)
+{
+    if (MEM_32bits())
+        return (size_t)MEM_readLE32(memPtr);
+    else
+        return (size_t)MEM_readLE64(memPtr);
+}
+
+MEM_STATIC void MEM_writeLEST(void* memPtr, size_t val)
+{
+    if (MEM_32bits())
+        MEM_writeLE32(memPtr, (U32)val);
+    else
+        MEM_writeLE64(memPtr, (U64)val);
+}
+
+/*=== Big endian r/w ===*/
+
+MEM_STATIC U32 MEM_readBE32(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_swap32(MEM_read32(memPtr));
+    else
+        return MEM_read32(memPtr);
+}
+
+MEM_STATIC void MEM_writeBE32(void* memPtr, U32 val32)
+{
+    if (MEM_isLittleEndian())
+        MEM_write32(memPtr, MEM_swap32(val32));
+    else
+        MEM_write32(memPtr, val32);
+}
+
+MEM_STATIC U64 MEM_readBE64(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_swap64(MEM_read64(memPtr));
+    else
+        return MEM_read64(memPtr);
+}
+
+MEM_STATIC void MEM_writeBE64(void* memPtr, U64 val64)
+{
+    if (MEM_isLittleEndian())
+        MEM_write64(memPtr, MEM_swap64(val64));
+    else
+        MEM_write64(memPtr, val64);
+}
+
+MEM_STATIC size_t MEM_readBEST(const void* memPtr)
+{
+    if (MEM_32bits())
+        return (size_t)MEM_readBE32(memPtr);
+    else
+        return (size_t)MEM_readBE64(memPtr);
+}
+
+MEM_STATIC void MEM_writeBEST(void* memPtr, size_t val)
+{
+    if (MEM_32bits())
+        MEM_writeBE32(memPtr, (U32)val);
+    else
+        MEM_writeBE64(memPtr, (U64)val);
+}
+
+/* code only tested on 32 and 64 bits systems */
+MEM_STATIC void MEM_check(void) { DEBUG_STATIC_ASSERT((sizeof(size_t)==4) || (sizeof(size_t)==8)); }
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* MEM_H_MODULE */
--- a/third_party/zstd/lib/common/pool.c
+++ b/third_party/zstd/lib/common/pool.c
@ -0,0 +1,372 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* ======   Dependencies   ======= */
+#include "third_party/zstd/lib/common/allocations.h"  /* ZSTD_customCalloc, ZSTD_customFree */
+#include "third_party/zstd/lib/common/zstd_deps.h" /* size_t */
+#include "third_party/zstd/lib/common/debug.h"     /* assert */
+#include "third_party/zstd/lib/common/pool.h"
+
+/* ======   Compiler specifics   ====== */
+#if defined(_MSC_VER)
+#  pragma warning(disable : 4204)        /* disable: C4204: non-constant aggregate initializer */
+#endif
+
+
+#ifdef ZSTD_MULTITHREAD
+
+#include "third_party/zstd/lib/common/threading.h"   /* pthread adaptation */
+
+/* A job is a function and an opaque argument */
+typedef struct POOL_job_s {
+    POOL_function function;
+    void *opaque;
+} POOL_job;
+
+struct POOL_ctx_s {
+    ZSTD_customMem customMem;
+    /* Keep track of the threads */
+    ZSTD_pthread_t* threads;
+    size_t threadCapacity;
+    size_t threadLimit;
+
+    /* The queue is a circular buffer */
+    POOL_job *queue;
+    size_t queueHead;
+    size_t queueTail;
+    size_t queueSize;
+
+    /* The number of threads working on jobs */
+    size_t numThreadsBusy;
+    /* Indicates if the queue is empty */
+    int queueEmpty;
+
+    /* The mutex protects the queue */
+    ZSTD_pthread_mutex_t queueMutex;
+    /* Condition variable for pushers to wait on when the queue is full */
+    ZSTD_pthread_cond_t queuePushCond;
+    /* Condition variables for poppers to wait on when the queue is empty */
+    ZSTD_pthread_cond_t queuePopCond;
+    /* Indicates if the queue is shutting down */
+    int shutdown;
+};
+
+/* POOL_thread() :
+ * Work thread for the thread pool.
+ * Waits for jobs and executes them.
+ * @returns : NULL on failure else non-null.
+ */
+static void* POOL_thread(void* opaque) {
+    POOL_ctx* const ctx = (POOL_ctx*)opaque;
+    if (!ctx) { return NULL; }
+    for (;;) {
+        /* Lock the mutex and wait for a non-empty queue or until shutdown */
+        ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+
+        while ( ctx->queueEmpty
+            || (ctx->numThreadsBusy >= ctx->threadLimit) ) {
+            if (ctx->shutdown) {
+                /* even if !queueEmpty, (possible if numThreadsBusy >= threadLimit),
+                 * a few threads will be shutdown while !queueEmpty,
+                 * but enough threads will remain active to finish the queue */
+                ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+                return opaque;
+            }
+            ZSTD_pthread_cond_wait(&ctx->queuePopCond, &ctx->queueMutex);
+        }
+        /* Pop a job off the queue */
+        {   POOL_job const job = ctx->queue[ctx->queueHead];
+            ctx->queueHead = (ctx->queueHead + 1) % ctx->queueSize;
+            ctx->numThreadsBusy++;
+            ctx->queueEmpty = (ctx->queueHead == ctx->queueTail);
+            /* Unlock the mutex, signal a pusher, and run the job */
+            ZSTD_pthread_cond_signal(&ctx->queuePushCond);
+            ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+
+            job.function(job.opaque);
+
+            /* If the intended queue size was 0, signal after finishing job */
+            ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+            ctx->numThreadsBusy--;
+            ZSTD_pthread_cond_signal(&ctx->queuePushCond);
+            ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+        }
+    }  /* for (;;) */
+    assert(0);  /* Unreachable */
+}
+
+/* ZSTD_createThreadPool() : public access point */
+POOL_ctx* ZSTD_createThreadPool(size_t numThreads) {
+    return POOL_create (numThreads, 0);
+}
+
+POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) {
+    return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem);
+}
+
+POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize,
+                               ZSTD_customMem customMem)
+{
+    POOL_ctx* ctx;
+    /* Check parameters */
+    if (!numThreads) { return NULL; }
+    /* Allocate the context and zero initialize */
+    ctx = (POOL_ctx*)ZSTD_customCalloc(sizeof(POOL_ctx), customMem);
+    if (!ctx) { return NULL; }
+    /* Initialize the job queue.
+     * It needs one extra space since one space is wasted to differentiate
+     * empty and full queues.
+     */
+    ctx->queueSize = queueSize + 1;
+    ctx->queue = (POOL_job*)ZSTD_customCalloc(ctx->queueSize * sizeof(POOL_job), customMem);
+    ctx->queueHead = 0;
+    ctx->queueTail = 0;
+    ctx->numThreadsBusy = 0;
+    ctx->queueEmpty = 1;
+    {
+        int error = 0;
+        error |= ZSTD_pthread_mutex_init(&ctx->queueMutex, NULL);
+        error |= ZSTD_pthread_cond_init(&ctx->queuePushCond, NULL);
+        error |= ZSTD_pthread_cond_init(&ctx->queuePopCond, NULL);
+        if (error) { POOL_free(ctx); return NULL; }
+    }
+    ctx->shutdown = 0;
+    /* Allocate space for the thread handles */
+    ctx->threads = (ZSTD_pthread_t*)ZSTD_customCalloc(numThreads * sizeof(ZSTD_pthread_t), customMem);
+    ctx->threadCapacity = 0;
+    ctx->customMem = customMem;
+    /* Check for errors */
+    if (!ctx->threads || !ctx->queue) { POOL_free(ctx); return NULL; }
+    /* Initialize the threads */
+    {   size_t i;
+        for (i = 0; i < numThreads; ++i) {
+            if (ZSTD_pthread_create(&ctx->threads[i], NULL, &POOL_thread, ctx)) {
+                ctx->threadCapacity = i;
+                POOL_free(ctx);
+                return NULL;
+        }   }
+        ctx->threadCapacity = numThreads;
+        ctx->threadLimit = numThreads;
+    }
+    return ctx;
+}
+
+/*! POOL_join() :
+    Shutdown the queue, wake any sleeping threads, and join all of the threads.
+*/
+static void POOL_join(POOL_ctx* ctx) {
+    /* Shut down the queue */
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    ctx->shutdown = 1;
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+    /* Wake up sleeping threads */
+    ZSTD_pthread_cond_broadcast(&ctx->queuePushCond);
+    ZSTD_pthread_cond_broadcast(&ctx->queuePopCond);
+    /* Join all of the threads */
+    {   size_t i;
+        for (i = 0; i < ctx->threadCapacity; ++i) {
+            ZSTD_pthread_join(ctx->threads[i]);  /* note : could fail */
+    }   }
+}
+
+void POOL_free(POOL_ctx *ctx) {
+    if (!ctx) { return; }
+    POOL_join(ctx);
+    ZSTD_pthread_mutex_destroy(&ctx->queueMutex);
+    ZSTD_pthread_cond_destroy(&ctx->queuePushCond);
+    ZSTD_pthread_cond_destroy(&ctx->queuePopCond);
+    ZSTD_customFree(ctx->queue, ctx->customMem);
+    ZSTD_customFree(ctx->threads, ctx->customMem);
+    ZSTD_customFree(ctx, ctx->customMem);
+}
+
+/*! POOL_joinJobs() :
+ *  Waits for all queued jobs to finish executing.
+ */
+void POOL_joinJobs(POOL_ctx* ctx) {
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    while(!ctx->queueEmpty || ctx->numThreadsBusy > 0) {
+        ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex);
+    }
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+}
+
+void ZSTD_freeThreadPool (ZSTD_threadPool* pool) {
+  POOL_free (pool);
+}
+
+size_t POOL_sizeof(const POOL_ctx* ctx) {
+    if (ctx==NULL) return 0;  /* supports sizeof NULL */
+    return sizeof(*ctx)
+        + ctx->queueSize * sizeof(POOL_job)
+        + ctx->threadCapacity * sizeof(ZSTD_pthread_t);
+}
+
+
+/* @return : 0 on success, 1 on error */
+static int POOL_resize_internal(POOL_ctx* ctx, size_t numThreads)
+{
+    if (numThreads <= ctx->threadCapacity) {
+        if (!numThreads) return 1;
+        ctx->threadLimit = numThreads;
+        return 0;
+    }
+    /* numThreads > threadCapacity */
+    {   ZSTD_pthread_t* const threadPool = (ZSTD_pthread_t*)ZSTD_customCalloc(numThreads * sizeof(ZSTD_pthread_t), ctx->customMem);
+        if (!threadPool) return 1;
+        /* replace existing thread pool */
+        ZSTD_memcpy(threadPool, ctx->threads, ctx->threadCapacity * sizeof(*threadPool));
+        ZSTD_customFree(ctx->threads, ctx->customMem);
+        ctx->threads = threadPool;
+        /* Initialize additional threads */
+        {   size_t threadId;
+            for (threadId = ctx->threadCapacity; threadId < numThreads; ++threadId) {
+                if (ZSTD_pthread_create(&threadPool[threadId], NULL, &POOL_thread, ctx)) {
+                    ctx->threadCapacity = threadId;
+                    return 1;
+            }   }
+    }   }
+    /* successfully expanded */
+    ctx->threadCapacity = numThreads;
+    ctx->threadLimit = numThreads;
+    return 0;
+}
+
+/* @return : 0 on success, 1 on error */
+int POOL_resize(POOL_ctx* ctx, size_t numThreads)
+{
+    int result;
+    if (ctx==NULL) return 1;
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    result = POOL_resize_internal(ctx, numThreads);
+    ZSTD_pthread_cond_broadcast(&ctx->queuePopCond);
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+    return result;
+}
+
+/**
+ * Returns 1 if the queue is full and 0 otherwise.
+ *
+ * When queueSize is 1 (pool was created with an intended queueSize of 0),
+ * then a queue is empty if there is a thread free _and_ no job is waiting.
+ */
+static int isQueueFull(POOL_ctx const* ctx) {
+    if (ctx->queueSize > 1) {
+        return ctx->queueHead == ((ctx->queueTail + 1) % ctx->queueSize);
+    } else {
+        return (ctx->numThreadsBusy == ctx->threadLimit) ||
+               !ctx->queueEmpty;
+    }
+}
+
+
+static void
+POOL_add_internal(POOL_ctx* ctx, POOL_function function, void *opaque)
+{
+    POOL_job job;
+    job.function = function;
+    job.opaque = opaque;
+    assert(ctx != NULL);
+    if (ctx->shutdown) return;
+
+    ctx->queueEmpty = 0;
+    ctx->queue[ctx->queueTail] = job;
+    ctx->queueTail = (ctx->queueTail + 1) % ctx->queueSize;
+    ZSTD_pthread_cond_signal(&ctx->queuePopCond);
+}
+
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque)
+{
+    assert(ctx != NULL);
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    /* Wait until there is space in the queue for the new job */
+    while (isQueueFull(ctx) && (!ctx->shutdown)) {
+        ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex);
+    }
+    POOL_add_internal(ctx, function, opaque);
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+}
+
+
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque)
+{
+    assert(ctx != NULL);
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    if (isQueueFull(ctx)) {
+        ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+        return 0;
+    }
+    POOL_add_internal(ctx, function, opaque);
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+    return 1;
+}
+
+
+#else  /* ZSTD_MULTITHREAD  not defined */
+
+/* ========================== */
+/* No multi-threading support */
+/* ========================== */
+
+
+/* We don't need any data, but if it is empty, malloc() might return NULL. */
+struct POOL_ctx_s {
+    int dummy;
+};
+static POOL_ctx g_poolCtx;
+
+POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) {
+    return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem);
+}
+
+POOL_ctx*
+POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem)
+{
+    (void)numThreads;
+    (void)queueSize;
+    (void)customMem;
+    return &g_poolCtx;
+}
+
+void POOL_free(POOL_ctx* ctx) {
+    assert(!ctx || ctx == &g_poolCtx);
+    (void)ctx;
+}
+
+void POOL_joinJobs(POOL_ctx* ctx){
+    assert(!ctx || ctx == &g_poolCtx);
+    (void)ctx;
+}
+
+int POOL_resize(POOL_ctx* ctx, size_t numThreads) {
+    (void)ctx; (void)numThreads;
+    return 0;
+}
+
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque) {
+    (void)ctx;
+    function(opaque);
+}
+
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque) {
+    (void)ctx;
+    function(opaque);
+    return 1;
+}
+
+size_t POOL_sizeof(const POOL_ctx* ctx) {
+    if (ctx==NULL) return 0;  /* supports sizeof NULL */
+    assert(ctx == &g_poolCtx);
+    return sizeof(*ctx);
+}
+
+#endif  /* ZSTD_MULTITHREAD */
--- a/third_party/zstd/lib/common/pool.h
+++ b/third_party/zstd/lib/common/pool.h
@ -0,0 +1,92 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef POOL_H
+#define POOL_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+#include "third_party/zstd/lib/common/zstd_deps.h"
+#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_customMem */
+#include "third_party/zstd/zstd.h"
+#include "libc/assert.h"
+
+typedef struct POOL_ctx_s POOL_ctx;
+
+/*! POOL_create() :
+ *  Create a thread pool with at most `numThreads` threads.
+ * `numThreads` must be at least 1.
+ *  The maximum number of queued jobs before blocking is `queueSize`.
+ * @return : POOL_ctx pointer on success, else NULL.
+*/
+POOL_ctx* POOL_create(size_t numThreads, size_t queueSize);
+
+POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize,
+                               ZSTD_customMem customMem);
+
+/*! POOL_free() :
+ *  Free a thread pool returned by POOL_create().
+ */
+void POOL_free(POOL_ctx* ctx);
+
+
+/*! POOL_joinJobs() :
+ *  Waits for all queued jobs to finish executing.
+ */
+void POOL_joinJobs(POOL_ctx* ctx);
+
+/*! POOL_resize() :
+ *  Expands or shrinks pool's number of threads.
+ *  This is more efficient than releasing + creating a new context,
+ *  since it tries to preserve and re-use existing threads.
+ * `numThreads` must be at least 1.
+ * @return : 0 when resize was successful,
+ *           !0 (typically 1) if there is an error.
+ *    note : only numThreads can be resized, queueSize remains unchanged.
+ */
+int POOL_resize(POOL_ctx* ctx, size_t numThreads);
+
+/*! POOL_sizeof() :
+ * @return threadpool memory usage
+ *  note : compatible with NULL (returns 0 in this case)
+ */
+size_t POOL_sizeof(const POOL_ctx* ctx);
+
+/*! POOL_function :
+ *  The function type that can be added to a thread pool.
+ */
+typedef void (*POOL_function)(void*);
+
+/*! POOL_add() :
+ *  Add the job `function(opaque)` to the thread pool. `ctx` must be valid.
+ *  Possibly blocks until there is room in the queue.
+ *  Note : The function may be executed asynchronously,
+ *         therefore, `opaque` must live until function has been completed.
+ */
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque);
+
+
+/*! POOL_tryAdd() :
+ *  Add the job `function(opaque)` to thread pool _if_ a queue slot is available.
+ *  Returns immediately even if not (does not block).
+ * @return : 1 if successful, 0 if not.
+ */
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif
--- a/third_party/zstd/lib/common/portability_macros.h
+++ b/third_party/zstd/lib/common/portability_macros.h
@ -0,0 +1,157 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_PORTABILITY_MACROS_H
+#define ZSTD_PORTABILITY_MACROS_H
+
+/**
+ * This header file contains macro definitions to support portability.
+ * This header is shared between C and ASM code, so it MUST only
+ * contain macro definitions. It MUST not contain any C code.
+ *
+ * This header ONLY defines macros to detect platforms/feature support.
+ *
+ */
+
+
+/* compat. with non-clang compilers */
+#ifndef __has_attribute
+  #define __has_attribute(x) 0
+#endif
+
+/* compat. with non-clang compilers */
+#ifndef __has_builtin
+#  define __has_builtin(x) 0
+#endif
+
+/* compat. with non-clang compilers */
+#ifndef __has_feature
+#  define __has_feature(x) 0
+#endif
+
+/* detects whether we are being compiled under msan */
+#ifndef ZSTD_MEMORY_SANITIZER
+#  if __has_feature(memory_sanitizer)
+#    define ZSTD_MEMORY_SANITIZER 1
+#  else
+#    define ZSTD_MEMORY_SANITIZER 0
+#  endif
+#endif
+
+/* detects whether we are being compiled under asan */
+#ifndef ZSTD_ADDRESS_SANITIZER
+#  if __has_feature(address_sanitizer)
+#    define ZSTD_ADDRESS_SANITIZER 1
+#  elif defined(__SANITIZE_ADDRESS__)
+#    define ZSTD_ADDRESS_SANITIZER 1
+#  else
+#    define ZSTD_ADDRESS_SANITIZER 0
+#  endif
+#endif
+
+/* detects whether we are being compiled under dfsan */
+#ifndef ZSTD_DATAFLOW_SANITIZER
+# if __has_feature(dataflow_sanitizer)
+#  define ZSTD_DATAFLOW_SANITIZER 1
+# else
+#  define ZSTD_DATAFLOW_SANITIZER 0
+# endif
+#endif
+
+/* Mark the internal assembly functions as hidden  */
+#ifdef __ELF__
+# define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func
+#else
+# define ZSTD_HIDE_ASM_FUNCTION(func)
+#endif
+
+/* Enable runtime BMI2 dispatch based on the CPU.
+ * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default.
+ */
+#ifndef DYNAMIC_BMI2
+  #if ((defined(__clang__) && __has_attribute(__target__)) \
+      || (defined(__GNUC__) \
+          && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \
+      && (defined(__x86_64__) || defined(_M_X64)) \
+      && !defined(__BMI2__)
+  #  define DYNAMIC_BMI2 1
+  #else
+  #  define DYNAMIC_BMI2 0
+  #endif
+#endif
+
+/**
+ * Only enable assembly for GNUC compatible compilers,
+ * because other platforms may not support GAS assembly syntax.
+ *
+ * Only enable assembly for Linux / MacOS, other platforms may
+ * work, but they haven't been tested. This could likely be
+ * extended to BSD systems.
+ *
+ * Disable assembly when MSAN is enabled, because MSAN requires
+ * 100% of code to be instrumented to work.
+ */
+#if defined(__GNUC__)
+#  if defined(__linux__) || defined(__linux) || defined(__APPLE__)
+#    if ZSTD_MEMORY_SANITIZER
+#      define ZSTD_ASM_SUPPORTED 0
+#    elif ZSTD_DATAFLOW_SANITIZER
+#      define ZSTD_ASM_SUPPORTED 0
+#    else
+#      define ZSTD_ASM_SUPPORTED 1
+#    endif
+#  else
+#    define ZSTD_ASM_SUPPORTED 0
+#  endif
+#else
+#  define ZSTD_ASM_SUPPORTED 0
+#endif
+
+/**
+ * Determines whether we should enable assembly for x86-64
+ * with BMI2.
+ *
+ * Enable if all of the following conditions hold:
+ * - ASM hasn't been explicitly disabled by defining ZSTD_DISABLE_ASM
+ * - Assembly is supported
+ * - We are compiling for x86-64 and either:
+ *   - DYNAMIC_BMI2 is enabled
+ *   - BMI2 is supported at compile time
+ */
+#if !defined(ZSTD_DISABLE_ASM) &&                                 \
+    ZSTD_ASM_SUPPORTED &&                                         \
+    defined(__x86_64__) &&                                        \
+    (DYNAMIC_BMI2 || defined(__BMI2__))
+# define ZSTD_ENABLE_ASM_X86_64_BMI2 1
+#else
+# define ZSTD_ENABLE_ASM_X86_64_BMI2 0
+#endif
+
+/*
+ * For x86 ELF targets, add .note.gnu.property section for Intel CET in
+ * assembly sources when CET is enabled.
+ *
+ * Additionally, any function that may be called indirectly must begin
+ * with ZSTD_CET_ENDBRANCH.
+ */
+#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \
+    && defined(__has_include)
+# if __has_include(<cet.h>)
+// MISSING #include <cet.h>
+#  define ZSTD_CET_ENDBRANCH _CET_ENDBR
+# endif
+#endif
+
+#ifndef ZSTD_CET_ENDBRANCH
+# define ZSTD_CET_ENDBRANCH
+#endif
+
+#endif /* ZSTD_PORTABILITY_MACROS_H */
--- a/third_party/zstd/lib/common/threading.c
+++ b/third_party/zstd/lib/common/threading.c
@ -0,0 +1,183 @@
+// clang-format off
+/**
+ * Copyright (c) 2016 Tino Reichardt
+ * All rights reserved.
+ *
+ * You can contact the author at:
+ * - zstdmt source repository: https://github.com/mcmilk/zstdmt
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/**
+ * This file will hold wrapper for systems, which do not support pthreads
+ */
+
+#include "third_party/zstd/lib/common/threading.h"
+
+/* create fake symbol to avoid empty translation unit warning */
+int g_ZSTD_threading_useless_symbol;
+
+#if defined(ZSTD_MULTITHREAD) && defined(_WIN32)
+
+/**
+ * Windows minimalist Pthread Wrapper
+ */
+
+
+/* ===  Dependencies  === */
+// MISSING #include <process.h>
+#include "libc/errno.h"
+
+
+/* ===  Implementation  === */
+
+typedef struct {
+    void* (*start_routine)(void*);
+    void* arg;
+    int initialized;
+    ZSTD_pthread_cond_t initialized_cond;
+    ZSTD_pthread_mutex_t initialized_mutex;
+} ZSTD_thread_params_t;
+
+static unsigned __stdcall worker(void *arg)
+{
+    void* (*start_routine)(void*);
+    void* thread_arg;
+
+    /* Initialized thread_arg and start_routine and signal main thread that we don't need it
+     * to wait any longer.
+     */
+    {
+        ZSTD_thread_params_t*  thread_param = (ZSTD_thread_params_t*)arg;
+        thread_arg = thread_param->arg;
+        start_routine = thread_param->start_routine;
+
+        /* Signal main thread that we are running and do not depend on its memory anymore */
+        ZSTD_pthread_mutex_lock(&thread_param->initialized_mutex);
+        thread_param->initialized = 1;
+        ZSTD_pthread_cond_signal(&thread_param->initialized_cond);
+        ZSTD_pthread_mutex_unlock(&thread_param->initialized_mutex);
+    }
+
+    start_routine(thread_arg);
+
+    return 0;
+}
+
+int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused,
+            void* (*start_routine) (void*), void* arg)
+{
+    ZSTD_thread_params_t thread_param;
+    (void)unused;
+
+    if (thread==NULL) return -1;
+    *thread = NULL;
+
+    thread_param.start_routine = start_routine;
+    thread_param.arg = arg;
+    thread_param.initialized = 0;
+
+    /* Setup thread initialization synchronization */
+    if(ZSTD_pthread_cond_init(&thread_param.initialized_cond, NULL)) {
+        /* Should never happen on Windows */
+        return -1;
+    }
+    if(ZSTD_pthread_mutex_init(&thread_param.initialized_mutex, NULL)) {
+        /* Should never happen on Windows */
+        ZSTD_pthread_cond_destroy(&thread_param.initialized_cond);
+        return -1;
+    }
+
+    /* Spawn thread */
+    *thread = (HANDLE)_beginthreadex(NULL, 0, worker, &thread_param, 0, NULL);
+    if (*thread==NULL) {
+        ZSTD_pthread_mutex_destroy(&thread_param.initialized_mutex);
+        ZSTD_pthread_cond_destroy(&thread_param.initialized_cond);
+        return errno;
+    }
+
+    /* Wait for thread to be initialized */
+    ZSTD_pthread_mutex_lock(&thread_param.initialized_mutex);
+    while(!thread_param.initialized) {
+        ZSTD_pthread_cond_wait(&thread_param.initialized_cond, &thread_param.initialized_mutex);
+    }
+    ZSTD_pthread_mutex_unlock(&thread_param.initialized_mutex);
+    ZSTD_pthread_mutex_destroy(&thread_param.initialized_mutex);
+    ZSTD_pthread_cond_destroy(&thread_param.initialized_cond);
+
+    return 0;
+}
+
+int ZSTD_pthread_join(ZSTD_pthread_t thread)
+{
+    DWORD result;
+
+    if (!thread) return 0;
+
+    result = WaitForSingleObject(thread, INFINITE);
+    CloseHandle(thread);
+
+    switch (result) {
+    case WAIT_OBJECT_0:
+        return 0;
+    case WAIT_ABANDONED:
+        return EINVAL;
+    default:
+        return GetLastError();
+    }
+}
+
+#endif   /* ZSTD_MULTITHREAD */
+
+#if defined(ZSTD_MULTITHREAD) && DEBUGLEVEL >= 1 && !defined(_WIN32)
+
+#define ZSTD_DEPS_NEED_MALLOC
+#include "third_party/zstd/lib/common/zstd_deps.h"
+
+int ZSTD_pthread_mutex_init(ZSTD_pthread_mutex_t* mutex, pthread_mutexattr_t const* attr)
+{
+    assert(mutex != NULL);
+    *mutex = (pthread_mutex_t*)ZSTD_malloc(sizeof(pthread_mutex_t));
+    if (!*mutex)
+        return 1;
+    return pthread_mutex_init(*mutex, attr);
+}
+
+int ZSTD_pthread_mutex_destroy(ZSTD_pthread_mutex_t* mutex)
+{
+    assert(mutex != NULL);
+    if (!*mutex)
+        return 0;
+    {
+        int const ret = pthread_mutex_destroy(*mutex);
+        ZSTD_free(*mutex);
+        return ret;
+    }
+}
+
+int ZSTD_pthread_cond_init(ZSTD_pthread_cond_t* cond, pthread_condattr_t const* attr)
+{
+    assert(cond != NULL);
+    *cond = (pthread_cond_t*)ZSTD_malloc(sizeof(pthread_cond_t));
+    if (!*cond)
+        return 1;
+    return pthread_cond_init(*cond, attr);
+}
+
+int ZSTD_pthread_cond_destroy(ZSTD_pthread_cond_t* cond)
+{
+    assert(cond != NULL);
+    if (!*cond)
+        return 0;
+    {
+        int const ret = pthread_cond_destroy(*cond);
+        ZSTD_free(*cond);
+        return ret;
+    }
+}
+
+#endif
--- a/third_party/zstd/lib/common/threading.h
+++ b/third_party/zstd/lib/common/threading.h
@ -0,0 +1,172 @@
+// clang-format off
+/**
+ * Copyright (c) 2016 Tino Reichardt
+ * All rights reserved.
+ *
+ * You can contact the author at:
+ * - zstdmt source repository: https://github.com/mcmilk/zstdmt
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef THREADING_H_938743
+#define THREADING_H_938743
+
+#include "third_party/zstd/lib/common/debug.h"
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#if defined(ZSTD_MULTITHREAD) && defined(_WIN32)
+
+/**
+ * Windows minimalist Pthread Wrapper
+ */
+#ifdef WINVER
+#  undef WINVER
+#endif
+#define WINVER       0x0600
+
+#ifdef _WIN32_WINNT
+#  undef _WIN32_WINNT
+#endif
+#define _WIN32_WINNT 0x0600
+
+#ifndef WIN32_LEAN_AND_MEAN
+#  define WIN32_LEAN_AND_MEAN
+#endif
+
+#undef ERROR   /* reported already defined on VS 2015 (Rich Geldreich) */
+#include "libc/nt/accounting.h"
+#include "libc/nt/automation.h"
+#include "libc/nt/console.h"
+#include "libc/nt/debug.h"
+#include "libc/nt/dll.h"
+#include "libc/nt/enum/keyaccess.h"
+#include "libc/nt/enum/regtype.h"
+#include "libc/nt/errors.h"
+#include "libc/nt/events.h"
+#include "libc/nt/files.h"
+#include "libc/nt/ipc.h"
+#include "libc/nt/memory.h"
+#include "libc/nt/paint.h"
+#include "libc/nt/process.h"
+#include "libc/nt/registry.h"
+#include "libc/nt/synchronization.h"
+#include "libc/nt/thread.h"
+#include "libc/nt/windows.h"
+#include "libc/nt/winsock.h"
+#undef ERROR
+#define ERROR(name) ZSTD_ERROR(name)
+
+
+/* mutex */
+#define ZSTD_pthread_mutex_t           CRITICAL_SECTION
+#define ZSTD_pthread_mutex_init(a, b)  ((void)(b), InitializeCriticalSection((a)), 0)
+#define ZSTD_pthread_mutex_destroy(a)  DeleteCriticalSection((a))
+#define ZSTD_pthread_mutex_lock(a)     EnterCriticalSection((a))
+#define ZSTD_pthread_mutex_unlock(a)   LeaveCriticalSection((a))
+
+/* condition variable */
+#define ZSTD_pthread_cond_t             CONDITION_VARIABLE
+#define ZSTD_pthread_cond_init(a, b)    ((void)(b), InitializeConditionVariable((a)), 0)
+#define ZSTD_pthread_cond_destroy(a)    ((void)(a))
+#define ZSTD_pthread_cond_wait(a, b)    SleepConditionVariableCS((a), (b), INFINITE)
+#define ZSTD_pthread_cond_signal(a)     WakeConditionVariable((a))
+#define ZSTD_pthread_cond_broadcast(a)  WakeAllConditionVariable((a))
+
+/* ZSTD_pthread_create() and ZSTD_pthread_join() */
+typedef HANDLE ZSTD_pthread_t;
+
+int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused,
+                   void* (*start_routine) (void*), void* arg);
+
+int ZSTD_pthread_join(ZSTD_pthread_t thread);
+
+/**
+ * add here more wrappers as required
+ */
+
+
+#elif defined(ZSTD_MULTITHREAD)    /* posix assumed ; need a better detection method */
+/* ===   POSIX Systems   === */
+#include "libc/calls/weirdtypes.h"
+#include "libc/sysv/consts/clock.h"
+#include "libc/thread/thread.h"
+#include "libc/thread/thread2.h"
+
+#if DEBUGLEVEL < 1
+
+#define ZSTD_pthread_mutex_t            pthread_mutex_t
+#define ZSTD_pthread_mutex_init(a, b)   pthread_mutex_init((a), (b))
+#define ZSTD_pthread_mutex_destroy(a)   pthread_mutex_destroy((a))
+#define ZSTD_pthread_mutex_lock(a)      pthread_mutex_lock((a))
+#define ZSTD_pthread_mutex_unlock(a)    pthread_mutex_unlock((a))
+
+#define ZSTD_pthread_cond_t             pthread_cond_t
+#define ZSTD_pthread_cond_init(a, b)    pthread_cond_init((a), (b))
+#define ZSTD_pthread_cond_destroy(a)    pthread_cond_destroy((a))
+#define ZSTD_pthread_cond_wait(a, b)    pthread_cond_wait((a), (b))
+#define ZSTD_pthread_cond_signal(a)     pthread_cond_signal((a))
+#define ZSTD_pthread_cond_broadcast(a)  pthread_cond_broadcast((a))
+
+#define ZSTD_pthread_t                  pthread_t
+#define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d))
+#define ZSTD_pthread_join(a)         pthread_join((a),NULL)
+
+#else /* DEBUGLEVEL >= 1 */
+
+/* Debug implementation of threading.
+ * In this implementation we use pointers for mutexes and condition variables.
+ * This way, if we forget to init/destroy them the program will crash or ASAN
+ * will report leaks.
+ */
+
+#define ZSTD_pthread_mutex_t            pthread_mutex_t*
+int ZSTD_pthread_mutex_init(ZSTD_pthread_mutex_t* mutex, pthread_mutexattr_t const* attr);
+int ZSTD_pthread_mutex_destroy(ZSTD_pthread_mutex_t* mutex);
+#define ZSTD_pthread_mutex_lock(a)      pthread_mutex_lock(*(a))
+#define ZSTD_pthread_mutex_unlock(a)    pthread_mutex_unlock(*(a))
+
+#define ZSTD_pthread_cond_t             pthread_cond_t*
+int ZSTD_pthread_cond_init(ZSTD_pthread_cond_t* cond, pthread_condattr_t const* attr);
+int ZSTD_pthread_cond_destroy(ZSTD_pthread_cond_t* cond);
+#define ZSTD_pthread_cond_wait(a, b)    pthread_cond_wait(*(a), *(b))
+#define ZSTD_pthread_cond_signal(a)     pthread_cond_signal(*(a))
+#define ZSTD_pthread_cond_broadcast(a)  pthread_cond_broadcast(*(a))
+
+#define ZSTD_pthread_t                  pthread_t
+#define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d))
+#define ZSTD_pthread_join(a)         pthread_join((a),NULL)
+
+#endif
+
+#else  /* ZSTD_MULTITHREAD not defined */
+/* No multithreading support */
+
+typedef int ZSTD_pthread_mutex_t;
+#define ZSTD_pthread_mutex_init(a, b)   ((void)(a), (void)(b), 0)
+#define ZSTD_pthread_mutex_destroy(a)   ((void)(a))
+#define ZSTD_pthread_mutex_lock(a)      ((void)(a))
+#define ZSTD_pthread_mutex_unlock(a)    ((void)(a))
+
+typedef int ZSTD_pthread_cond_t;
+#define ZSTD_pthread_cond_init(a, b)    ((void)(a), (void)(b), 0)
+#define ZSTD_pthread_cond_destroy(a)    ((void)(a))
+#define ZSTD_pthread_cond_wait(a, b)    ((void)(a), (void)(b))
+#define ZSTD_pthread_cond_signal(a)     ((void)(a))
+#define ZSTD_pthread_cond_broadcast(a)  ((void)(a))
+
+/* do not use ZSTD_pthread_t */
+
+#endif /* ZSTD_MULTITHREAD */
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* THREADING_H_938743 */
--- a/third_party/zstd/lib/common/xxhash.c
+++ b/third_party/zstd/lib/common/xxhash.c
@ -0,0 +1,25 @@
+// clang-format off
+/*
+ *  xxHash - Fast Hash algorithm
+ *  Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ *  You can contact the author at :
+ *  - xxHash homepage: https://cyan4973.github.io/xxHash/
+ *  - xxHash source repository : https://github.com/Cyan4973/xxHash
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+*/
+
+
+
+/*
+ * xxhash.c instantiates functions defined in xxhash.h
+ */
+
+#define XXH_STATIC_LINKING_ONLY   /* access advanced declarations */
+#define XXH_IMPLEMENTATION   /* access definitions */
+
+#include "third_party/zstd/lib/common/xxhash.h"
--- a/third_party/zstd/lib/common/xxhash.h
+++ b/third_party/zstd/lib/common/xxhash.h
--- a/third_party/zstd/lib/common/zstd_common.c
+++ b/third_party/zstd/lib/common/zstd_common.c
@ -0,0 +1,49 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#define ZSTD_DEPS_NEED_MALLOC
+#include "third_party/zstd/lib/common/error_private.h"
+#include "third_party/zstd/lib/common/zstd_internal.h"
+
+
+/*-****************************************
+*  Version
+******************************************/
+unsigned ZSTD_versionNumber(void) { return ZSTD_VERSION_NUMBER; }
+
+const char* ZSTD_versionString(void) { return ZSTD_VERSION_STRING; }
+
+
+/*-****************************************
+*  ZSTD Error Management
+******************************************/
+#undef ZSTD_isError   /* defined within zstd_internal.h */
+/*! ZSTD_isError() :
+ *  tells if a return value is an error code
+ *  symbol is required for external callers */
+unsigned ZSTD_isError(size_t code) { return ERR_isError(code); }
+
+/*! ZSTD_getErrorName() :
+ *  provides error code string from function result (useful for debugging) */
+const char* ZSTD_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+/*! ZSTD_getError() :
+ *  convert a `size_t` function result into a proper ZSTD_errorCode enum */
+ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); }
+
+/*! ZSTD_getErrorString() :
+ *  provides error code string from enum */
+const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); }
--- a/third_party/zstd/lib/common/zstd_deps.h
+++ b/third_party/zstd/lib/common/zstd_deps.h
@ -0,0 +1,142 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* This file provides common libc dependencies that zstd requires.
+ * The purpose is to allow replacing this file with a custom implementation
+ * to compile zstd without libc support.
+ */
+
+/* Need:
+ * NULL
+ * INT_MAX
+ * UINT_MAX
+ * ZSTD_memcpy()
+ * ZSTD_memset()
+ * ZSTD_memmove()
+ */
+#ifndef ZSTD_DEPS_COMMON
+#define ZSTD_DEPS_COMMON
+
+#include "libc/limits.h"
+#include "libc/sysv/consts/_posix.h"
+#include "libc/sysv/consts/iov.h"
+#include "libc/sysv/consts/limits.h"
+#include "libc/sysv/consts/xopen.h"
+#include "libc/thread/thread.h"
+
+#include "libc/mem/alg.h"
+#include "libc/mem/mem.h"
+#include "libc/str/str.h"
+
+#if defined(__GNUC__) && __GNUC__ >= 4
+# define ZSTD_memcpy(d,s,l) __builtin_memcpy((d),(s),(l))
+# define ZSTD_memmove(d,s,l) __builtin_memmove((d),(s),(l))
+# define ZSTD_memset(p,v,l) __builtin_memset((p),(v),(l))
+#else
+# define ZSTD_memcpy(d,s,l) memcpy((d),(s),(l))
+# define ZSTD_memmove(d,s,l) memmove((d),(s),(l))
+# define ZSTD_memset(p,v,l) memset((p),(v),(l))
+#endif
+
+#endif /* ZSTD_DEPS_COMMON */
+
+/* Need:
+ * ZSTD_malloc()
+ * ZSTD_free()
+ * ZSTD_calloc()
+ */
+#ifdef ZSTD_DEPS_NEED_MALLOC
+#ifndef ZSTD_DEPS_MALLOC
+#define ZSTD_DEPS_MALLOC
+
+#include "libc/calls/calls.h"
+#include "libc/calls/termios.h"
+#include "libc/fmt/conv.h"
+#include "libc/limits.h"
+#include "libc/mem/alg.h"
+#include "libc/mem/alloca.h"
+#include "libc/mem/mem.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/rand.h"
+#include "libc/stdio/temp.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/exit.h"
+#include "third_party/getopt/getopt.h"
+#include "third_party/musl/crypt.h"
+#include "third_party/musl/rand48.h"
+
+#define ZSTD_malloc(s) malloc(s)
+#define ZSTD_calloc(n,s) calloc((n), (s))
+#define ZSTD_free(p) free((p))
+
+#endif /* ZSTD_DEPS_MALLOC */
+#endif /* ZSTD_DEPS_NEED_MALLOC */
+
+/*
+ * Provides 64-bit math support.
+ * Need:
+ * U64 ZSTD_div64(U64 dividend, U32 divisor)
+ */
+#ifdef ZSTD_DEPS_NEED_MATH64
+#ifndef ZSTD_DEPS_MATH64
+#define ZSTD_DEPS_MATH64
+
+#define ZSTD_div64(dividend, divisor) ((dividend) / (divisor))
+
+#endif /* ZSTD_DEPS_MATH64 */
+#endif /* ZSTD_DEPS_NEED_MATH64 */
+
+/* Need:
+ * assert()
+ */
+#ifdef ZSTD_DEPS_NEED_ASSERT
+#ifndef ZSTD_DEPS_ASSERT
+#define ZSTD_DEPS_ASSERT
+
+#include "libc/assert.h"
+
+#endif /* ZSTD_DEPS_ASSERT */
+#endif /* ZSTD_DEPS_NEED_ASSERT */
+
+/* Need:
+ * ZSTD_DEBUG_PRINT()
+ */
+#ifdef ZSTD_DEPS_NEED_IO
+#ifndef ZSTD_DEPS_IO
+#define ZSTD_DEPS_IO
+
+#include "libc/calls/calls.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/fmt/fmt.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/stdio.h"
+#include "libc/stdio/temp.h"
+#include "third_party/musl/tempnam.h"
+#define ZSTD_DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
+
+#endif /* ZSTD_DEPS_IO */
+#endif /* ZSTD_DEPS_NEED_IO */
+
+/* Only requested when <stdint.h> is known to be present.
+ * Need:
+ * intptr_t
+ */
+#ifdef ZSTD_DEPS_NEED_STDINT
+#ifndef ZSTD_DEPS_STDINT
+#define ZSTD_DEPS_STDINT
+
+#include "libc/inttypes.h"
+#include "libc/limits.h"
+#include "libc/literal.h"
+
+#endif /* ZSTD_DEPS_STDINT */
+#endif /* ZSTD_DEPS_NEED_STDINT */
--- a/third_party/zstd/lib/common/zstd_internal.h
+++ b/third_party/zstd/lib/common/zstd_internal.h
@ -0,0 +1,393 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_CCOMMON_H_MODULE
+#define ZSTD_CCOMMON_H_MODULE
+
+/* this module contains definitions which must be identical
+ * across compression, decompression and dictBuilder.
+ * It also contains a few functions useful to at least 2 of them
+ * and which benefit from being inlined */
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include "third_party/zstd/lib/common/compiler.h"
+#include "third_party/zstd/lib/common/cpu.h"
+#include "third_party/zstd/lib/common/mem.h"
+#include "third_party/zstd/lib/common/debug.h"                 /* assert, DEBUGLOG, RAWLOG, g_debuglevel */
+#include "third_party/zstd/lib/common/error_private.h"
+#define ZSTD_STATIC_LINKING_ONLY
+#include "third_party/zstd/zstd.h"
+#define FSE_STATIC_LINKING_ONLY
+#include "third_party/zstd/lib/common/fse.h"
+#include "third_party/zstd/lib/common/huf.h"
+#ifndef XXH_STATIC_LINKING_ONLY
+#  define XXH_STATIC_LINKING_ONLY  /* XXH64_state_t */
+#endif
+#include "third_party/zstd/lib/common/xxhash.h"                /* XXH_reset, update, digest */
+#ifndef ZSTD_NO_TRACE
+#include "third_party/zstd/lib/common/zstd_trace.h"
+#else
+#  define ZSTD_TRACE 0
+#endif
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* ---- static assert (debug) --- */
+#define ZSTD_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c)
+#define ZSTD_isError ERR_isError   /* for inlining */
+#define FSE_isError  ERR_isError
+#define HUF_isError  ERR_isError
+
+
+/*-*************************************
+*  shared macros
+***************************************/
+#undef MIN
+#undef MAX
+#define MIN(a,b) ((a)<(b) ? (a) : (b))
+#define MAX(a,b) ((a)>(b) ? (a) : (b))
+#define BOUNDED(min,val,max) (MAX(min,MIN(val,max)))
+
+
+/*-*************************************
+*  Common constants
+***************************************/
+#define ZSTD_OPT_NUM    (1<<12)
+
+#define ZSTD_REP_NUM      3                 /* number of repcodes */
+static UNUSED_ATTR const U32 repStartValue[ZSTD_REP_NUM] = { 1, 4, 8 };
+
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define BIT7 128
+#define BIT6  64
+#define BIT5  32
+#define BIT4  16
+#define BIT1   2
+#define BIT0   1
+
+#define ZSTD_WINDOWLOG_ABSOLUTEMIN 10
+static UNUSED_ATTR const size_t ZSTD_fcs_fieldSize[4] = { 0, 2, 4, 8 };
+static UNUSED_ATTR const size_t ZSTD_did_fieldSize[4] = { 0, 1, 2, 4 };
+
+#define ZSTD_FRAMEIDSIZE 4   /* magic number size */
+
+#define ZSTD_BLOCKHEADERSIZE 3   /* C standard doesn't allow `static const` variable to be init using another `static const` variable */
+static UNUSED_ATTR const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE;
+typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e;
+
+#define ZSTD_FRAMECHECKSUMSIZE 4
+
+#define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */)   /* for a non-null block */
+#define MIN_LITERALS_FOR_4_STREAMS 6
+
+typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e;
+
+#define LONGNBSEQ 0x7F00
+
+#define MINMATCH 3
+
+#define Litbits  8
+#define LitHufLog 11
+#define MaxLit ((1<<Litbits) - 1)
+#define MaxML   52
+#define MaxLL   35
+#define DefaultMaxOff 28
+#define MaxOff  31
+#define MaxSeq MAX(MaxLL, MaxML)   /* Assumption : MaxOff < MaxLL,MaxML */
+#define MLFSELog    9
+#define LLFSELog    9
+#define OffFSELog   8
+#define MaxFSELog  MAX(MAX(MLFSELog, LLFSELog), OffFSELog)
+#define MaxMLBits 16
+#define MaxLLBits 16
+
+#define ZSTD_MAX_HUF_HEADER_SIZE 128 /* header + <= 127 byte tree description */
+/* Each table cannot take more than #symbols * FSELog bits */
+#define ZSTD_MAX_FSE_HEADERS_SIZE (((MaxML + 1) * MLFSELog + (MaxLL + 1) * LLFSELog + (MaxOff + 1) * OffFSELog + 7) / 8)
+
+static UNUSED_ATTR const U8 LL_bits[MaxLL+1] = {
+     0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     1, 1, 1, 1, 2, 2, 3, 3,
+     4, 6, 7, 8, 9,10,11,12,
+    13,14,15,16
+};
+static UNUSED_ATTR const S16 LL_defaultNorm[MaxLL+1] = {
+     4, 3, 2, 2, 2, 2, 2, 2,
+     2, 2, 2, 2, 2, 1, 1, 1,
+     2, 2, 2, 2, 2, 2, 2, 2,
+     2, 3, 2, 1, 1, 1, 1, 1,
+    -1,-1,-1,-1
+};
+#define LL_DEFAULTNORMLOG 6  /* for static allocation */
+static UNUSED_ATTR const U32 LL_defaultNormLog = LL_DEFAULTNORMLOG;
+
+static UNUSED_ATTR const U8 ML_bits[MaxML+1] = {
+     0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     1, 1, 1, 1, 2, 2, 3, 3,
+     4, 4, 5, 7, 8, 9,10,11,
+    12,13,14,15,16
+};
+static UNUSED_ATTR const S16 ML_defaultNorm[MaxML+1] = {
+     1, 4, 3, 2, 2, 2, 2, 2,
+     2, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 1, 1,-1,-1,
+    -1,-1,-1,-1,-1
+};
+#define ML_DEFAULTNORMLOG 6  /* for static allocation */
+static UNUSED_ATTR const U32 ML_defaultNormLog = ML_DEFAULTNORMLOG;
+
+static UNUSED_ATTR const S16 OF_defaultNorm[DefaultMaxOff+1] = {
+     1, 1, 1, 1, 1, 1, 2, 2,
+     2, 1, 1, 1, 1, 1, 1, 1,
+     1, 1, 1, 1, 1, 1, 1, 1,
+    -1,-1,-1,-1,-1
+};
+#define OF_DEFAULTNORMLOG 5  /* for static allocation */
+static UNUSED_ATTR const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG;
+
+
+/*-*******************************************
+*  Shared functions to include for inlining
+*********************************************/
+static void ZSTD_copy8(void* dst, const void* src) {
+#if defined(ZSTD_ARCH_ARM_NEON)
+    vst1_u8((uint8_t*)dst, vld1_u8((const uint8_t*)src));
+#else
+    ZSTD_memcpy(dst, src, 8);
+#endif
+}
+#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
+
+/* Need to use memmove here since the literal buffer can now be located within
+   the dst buffer. In circumstances where the op "catches up" to where the
+   literal buffer is, there can be partial overlaps in this call on the final
+   copy if the literal is being shifted by less than 16 bytes. */
+static void ZSTD_copy16(void* dst, const void* src) {
+#if defined(ZSTD_ARCH_ARM_NEON)
+    vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src));
+#elif defined(ZSTD_ARCH_X86_SSE2)
+    _mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((const __m128i*)src));
+#elif defined(__clang__)
+    ZSTD_memmove(dst, src, 16);
+#else
+    /* ZSTD_memmove is not inlined properly by gcc */
+    BYTE copy16_buf[16];
+    ZSTD_memcpy(copy16_buf, src, 16);
+    ZSTD_memcpy(dst, copy16_buf, 16);
+#endif
+}
+#define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
+
+#define WILDCOPY_OVERLENGTH 32
+#define WILDCOPY_VECLEN 16
+
+typedef enum {
+    ZSTD_no_overlap,
+    ZSTD_overlap_src_before_dst
+    /*  ZSTD_overlap_dst_before_src, */
+} ZSTD_overlap_e;
+
+/*! ZSTD_wildcopy() :
+ *  Custom version of ZSTD_memcpy(), can over read/write up to WILDCOPY_OVERLENGTH bytes (if length==0)
+ *  @param ovtype controls the overlap detection
+ *         - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
+ *         - ZSTD_overlap_src_before_dst: The src and dst may overlap, but they MUST be at least 8 bytes apart.
+ *           The src buffer must be before the dst buffer.
+ */
+MEM_STATIC FORCE_INLINE_ATTR
+void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e const ovtype)
+{
+    ptrdiff_t diff = (BYTE*)dst - (const BYTE*)src;
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + length;
+
+    if (ovtype == ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLEN) {
+        /* Handle short offset copies. */
+        do {
+            COPY8(op, ip)
+        } while (op < oend);
+    } else {
+        assert(diff >= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN);
+        /* Separate out the first COPY16() call because the copy length is
+         * almost certain to be short, so the branches have different
+         * probabilities. Since it is almost certain to be short, only do
+         * one COPY16() in the first call. Then, do two calls per loop since
+         * at that point it is more likely to have a high trip count.
+         */
+        ZSTD_copy16(op, ip);
+        if (16 >= length) return;
+        op += 16;
+        ip += 16;
+        do {
+            COPY16(op, ip);
+            COPY16(op, ip);
+        }
+        while (op < oend);
+    }
+}
+
+MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    size_t const length = MIN(dstCapacity, srcSize);
+    if (length > 0) {
+        ZSTD_memcpy(dst, src, length);
+    }
+    return length;
+}
+
+/* define "workspace is too large" as this number of times larger than needed */
+#define ZSTD_WORKSPACETOOLARGE_FACTOR 3
+
+/* when workspace is continuously too large
+ * during at least this number of times,
+ * context's memory usage is considered wasteful,
+ * because it's sized to handle a worst case scenario which rarely happens.
+ * In which case, resize it down to free some memory */
+#define ZSTD_WORKSPACETOOLARGE_MAXDURATION 128
+
+/* Controls whether the input/output buffer is buffered or stable. */
+typedef enum {
+    ZSTD_bm_buffered = 0,  /* Buffer the input/output */
+    ZSTD_bm_stable = 1     /* ZSTD_inBuffer/ZSTD_outBuffer is stable */
+} ZSTD_bufferMode_e;
+
+
+/*-*******************************************
+*  Private declarations
+*********************************************/
+typedef struct seqDef_s {
+    U32 offBase;   /* offBase == Offset + ZSTD_REP_NUM, or repcode 1,2,3 */
+    U16 litLength;
+    U16 mlBase;    /* mlBase == matchLength - MINMATCH */
+} seqDef;
+
+/* Controls whether seqStore has a single "long" litLength or matchLength. See seqStore_t. */
+typedef enum {
+    ZSTD_llt_none = 0,             /* no longLengthType */
+    ZSTD_llt_literalLength = 1,    /* represents a long literal */
+    ZSTD_llt_matchLength = 2       /* represents a long match */
+} ZSTD_longLengthType_e;
+
+typedef struct {
+    seqDef* sequencesStart;
+    seqDef* sequences;      /* ptr to end of sequences */
+    BYTE*  litStart;
+    BYTE*  lit;             /* ptr to end of literals */
+    BYTE*  llCode;
+    BYTE*  mlCode;
+    BYTE*  ofCode;
+    size_t maxNbSeq;
+    size_t maxNbLit;
+
+    /* longLengthPos and longLengthType to allow us to represent either a single litLength or matchLength
+     * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment
+     * the existing value of the litLength or matchLength by 0x10000.
+     */
+    ZSTD_longLengthType_e longLengthType;
+    U32                   longLengthPos;  /* Index of the sequence to apply long length modification to */
+} seqStore_t;
+
+typedef struct {
+    U32 litLength;
+    U32 matchLength;
+} ZSTD_sequenceLength;
+
+/**
+ * Returns the ZSTD_sequenceLength for the given sequences. It handles the decoding of long sequences
+ * indicated by longLengthPos and longLengthType, and adds MINMATCH back to matchLength.
+ */
+MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore, seqDef const* seq)
+{
+    ZSTD_sequenceLength seqLen;
+    seqLen.litLength = seq->litLength;
+    seqLen.matchLength = seq->mlBase + MINMATCH;
+    if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) {
+        if (seqStore->longLengthType == ZSTD_llt_literalLength) {
+            seqLen.litLength += 0x10000;
+        }
+        if (seqStore->longLengthType == ZSTD_llt_matchLength) {
+            seqLen.matchLength += 0x10000;
+        }
+    }
+    return seqLen;
+}
+
+/**
+ * Contains the compressed frame size and an upper-bound for the decompressed frame size.
+ * Note: before using `compressedSize`, check for errors using ZSTD_isError().
+ *       similarly, before using `decompressedBound`, check for errors using:
+ *          `decompressedBound != ZSTD_CONTENTSIZE_ERROR`
+ */
+typedef struct {
+    size_t nbBlocks;
+    size_t compressedSize;
+    unsigned long long decompressedBound;
+} ZSTD_frameSizeInfo;   /* decompress & legacy */
+
+const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx);   /* compress & dictBuilder */
+int ZSTD_seqToCodes(const seqStore_t* seqStorePtr);   /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */
+
+
+/* ZSTD_invalidateRepCodes() :
+ * ensures next compression will not use repcodes from previous block.
+ * Note : only works with regular variant;
+ *        do not use with extDict variant ! */
+void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx);   /* zstdmt, adaptive_compression (shouldn't get this definition from here) */
+
+
+typedef struct {
+    blockType_e blockType;
+    U32 lastBlock;
+    U32 origSize;
+} blockProperties_t;   /* declared here for decompress and fullbench */
+
+/*! ZSTD_getcBlockSize() :
+ *  Provides the size of compressed block from block header `src` */
+/*  Used by: decompress, fullbench */
+size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
+                          blockProperties_t* bpPtr);
+
+/*! ZSTD_decodeSeqHeaders() :
+ *  decode sequence header from src */
+/*  Used by: zstd_decompress_block, fullbench */
+size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
+                       const void* src, size_t srcSize);
+
+/**
+ * @returns true iff the CPU supports dynamic BMI2 dispatch.
+ */
+MEM_STATIC int ZSTD_cpuSupportsBmi2(void)
+{
+    ZSTD_cpuid_t cpuid = ZSTD_cpuid();
+    return ZSTD_cpuid_bmi1(cpuid) && ZSTD_cpuid_bmi2(cpuid);
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* ZSTD_CCOMMON_H_MODULE */
--- a/third_party/zstd/lib/common/zstd_trace.h
+++ b/third_party/zstd/lib/common/zstd_trace.h
@ -0,0 +1,164 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_TRACE_H
+#define ZSTD_TRACE_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+
+/* weak symbol support
+ * For now, enable conservatively:
+ * - Only GNUC
+ * - Only ELF
+ * - Only x86-64, i386 and aarch64
+ * Also, explicitly disable on platforms known not to work so they aren't
+ * forgotten in the future.
+ */
+#if !defined(ZSTD_HAVE_WEAK_SYMBOLS) && \
+    defined(__GNUC__) && defined(__ELF__) && \
+    (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) || defined(__aarch64__)) && \
+    !defined(__APPLE__) && !defined(_WIN32) && !defined(__MINGW32__) && \
+    !defined(__CYGWIN__) && !defined(_AIX)
+#  define ZSTD_HAVE_WEAK_SYMBOLS 1
+#else
+#  define ZSTD_HAVE_WEAK_SYMBOLS 0
+#endif
+#if ZSTD_HAVE_WEAK_SYMBOLS
+#  define ZSTD_WEAK_ATTR __attribute__((__weak__))
+#else
+#  define ZSTD_WEAK_ATTR
+#endif
+
+/* Only enable tracing when weak symbols are available. */
+#ifndef ZSTD_TRACE
+#  define ZSTD_TRACE ZSTD_HAVE_WEAK_SYMBOLS
+#endif
+
+#if ZSTD_TRACE
+
+struct ZSTD_CCtx_s;
+struct ZSTD_DCtx_s;
+struct ZSTD_CCtx_params_s;
+
+typedef struct {
+    /**
+     * ZSTD_VERSION_NUMBER
+     *
+     * This is guaranteed to be the first member of ZSTD_trace.
+     * Otherwise, this struct is not stable between versions. If
+     * the version number does not match your expectation, you
+     * should not interpret the rest of the struct.
+     */
+    unsigned version;
+    /**
+     * Non-zero if streaming (de)compression is used.
+     */
+    unsigned streaming;
+    /**
+     * The dictionary ID.
+     */
+    unsigned dictionaryID;
+    /**
+     * Is the dictionary cold?
+     * Only set on decompression.
+     */
+    unsigned dictionaryIsCold;
+    /**
+     * The dictionary size or zero if no dictionary.
+     */
+    size_t dictionarySize;
+    /**
+     * The uncompressed size of the data.
+     */
+    size_t uncompressedSize;
+    /**
+     * The compressed size of the data.
+     */
+    size_t compressedSize;
+    /**
+     * The fully resolved CCtx parameters (NULL on decompression).
+     */
+    struct ZSTD_CCtx_params_s const* params;
+    /**
+     * The ZSTD_CCtx pointer (NULL on decompression).
+     */
+    struct ZSTD_CCtx_s const* cctx;
+    /**
+     * The ZSTD_DCtx pointer (NULL on compression).
+     */
+    struct ZSTD_DCtx_s const* dctx;
+} ZSTD_Trace;
+
+/**
+ * A tracing context. It must be 0 when tracing is disabled.
+ * Otherwise, any non-zero value returned by a tracing begin()
+ * function is presented to any subsequent calls to end().
+ *
+ * Any non-zero value is treated as tracing is enabled and not
+ * interpreted by the library.
+ *
+ * Two possible uses are:
+ * * A timestamp for when the begin() function was called.
+ * * A unique key identifying the (de)compression, like the
+ *   address of the [dc]ctx pointer if you need to track
+ *   more information than just a timestamp.
+ */
+typedef unsigned long long ZSTD_TraceCtx;
+
+/**
+ * Trace the beginning of a compression call.
+ * @param cctx The dctx pointer for the compression.
+ *             It can be used as a key to map begin() to end().
+ * @returns Non-zero if tracing is enabled. The return value is
+ *          passed to ZSTD_trace_compress_end().
+ */
+ZSTD_WEAK_ATTR ZSTD_TraceCtx ZSTD_trace_compress_begin(
+    struct ZSTD_CCtx_s const* cctx);
+
+/**
+ * Trace the end of a compression call.
+ * @param ctx The return value of ZSTD_trace_compress_begin().
+ * @param trace The zstd tracing info.
+ */
+ZSTD_WEAK_ATTR void ZSTD_trace_compress_end(
+    ZSTD_TraceCtx ctx,
+    ZSTD_Trace const* trace);
+
+/**
+ * Trace the beginning of a decompression call.
+ * @param dctx The dctx pointer for the decompression.
+ *             It can be used as a key to map begin() to end().
+ * @returns Non-zero if tracing is enabled. The return value is
+ *          passed to ZSTD_trace_compress_end().
+ */
+ZSTD_WEAK_ATTR ZSTD_TraceCtx ZSTD_trace_decompress_begin(
+    struct ZSTD_DCtx_s const* dctx);
+
+/**
+ * Trace the end of a decompression call.
+ * @param ctx The return value of ZSTD_trace_decompress_begin().
+ * @param trace The zstd tracing info.
+ */
+ZSTD_WEAK_ATTR void ZSTD_trace_decompress_end(
+    ZSTD_TraceCtx ctx,
+    ZSTD_Trace const* trace);
+
+#endif /* ZSTD_TRACE */
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_TRACE_H */
--- a/third_party/zstd/lib/compress/clevels.h
+++ b/third_party/zstd/lib/compress/clevels.h
@ -0,0 +1,135 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_CLEVELS_H
+#define ZSTD_CLEVELS_H
+
+#define ZSTD_STATIC_LINKING_ONLY  /* ZSTD_compressionParameters  */
+#include "third_party/zstd/zstd.h"
+
+/*-=====  Pre-defined compression levels  =====-*/
+
+#define ZSTD_MAX_CLEVEL     22
+
+#ifdef __GNUC__
+__attribute__((__unused__))
+#endif
+
+static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = {
+{   /* "default" - for any srcSize > 256 KB */
+    /* W,  C,  H,  S,  L, TL, strat */
+    { 19, 12, 13,  1,  6,  1, ZSTD_fast    },  /* base for negative levels */
+    { 19, 13, 14,  1,  7,  0, ZSTD_fast    },  /* level  1 */
+    { 20, 15, 16,  1,  6,  0, ZSTD_fast    },  /* level  2 */
+    { 21, 16, 17,  1,  5,  0, ZSTD_dfast   },  /* level  3 */
+    { 21, 18, 18,  1,  5,  0, ZSTD_dfast   },  /* level  4 */
+    { 21, 18, 19,  3,  5,  2, ZSTD_greedy  },  /* level  5 */
+    { 21, 18, 19,  3,  5,  4, ZSTD_lazy    },  /* level  6 */
+    { 21, 19, 20,  4,  5,  8, ZSTD_lazy    },  /* level  7 */
+    { 21, 19, 20,  4,  5, 16, ZSTD_lazy2   },  /* level  8 */
+    { 22, 20, 21,  4,  5, 16, ZSTD_lazy2   },  /* level  9 */
+    { 22, 21, 22,  5,  5, 16, ZSTD_lazy2   },  /* level 10 */
+    { 22, 21, 22,  6,  5, 16, ZSTD_lazy2   },  /* level 11 */
+    { 22, 22, 23,  6,  5, 32, ZSTD_lazy2   },  /* level 12 */
+    { 22, 22, 22,  4,  5, 32, ZSTD_btlazy2 },  /* level 13 */
+    { 22, 22, 23,  5,  5, 32, ZSTD_btlazy2 },  /* level 14 */
+    { 22, 23, 23,  6,  5, 32, ZSTD_btlazy2 },  /* level 15 */
+    { 22, 22, 22,  5,  5, 48, ZSTD_btopt   },  /* level 16 */
+    { 23, 23, 22,  5,  4, 64, ZSTD_btopt   },  /* level 17 */
+    { 23, 23, 22,  6,  3, 64, ZSTD_btultra },  /* level 18 */
+    { 23, 24, 22,  7,  3,256, ZSTD_btultra2},  /* level 19 */
+    { 25, 25, 23,  7,  3,256, ZSTD_btultra2},  /* level 20 */
+    { 26, 26, 24,  7,  3,512, ZSTD_btultra2},  /* level 21 */
+    { 27, 27, 25,  9,  3,999, ZSTD_btultra2},  /* level 22 */
+},
+{   /* for srcSize <= 256 KB */
+    /* W,  C,  H,  S,  L,  T, strat */
+    { 18, 12, 13,  1,  5,  1, ZSTD_fast    },  /* base for negative levels */
+    { 18, 13, 14,  1,  6,  0, ZSTD_fast    },  /* level  1 */
+    { 18, 14, 14,  1,  5,  0, ZSTD_dfast   },  /* level  2 */
+    { 18, 16, 16,  1,  4,  0, ZSTD_dfast   },  /* level  3 */
+    { 18, 16, 17,  3,  5,  2, ZSTD_greedy  },  /* level  4.*/
+    { 18, 17, 18,  5,  5,  2, ZSTD_greedy  },  /* level  5.*/
+    { 18, 18, 19,  3,  5,  4, ZSTD_lazy    },  /* level  6.*/
+    { 18, 18, 19,  4,  4,  4, ZSTD_lazy    },  /* level  7 */
+    { 18, 18, 19,  4,  4,  8, ZSTD_lazy2   },  /* level  8 */
+    { 18, 18, 19,  5,  4,  8, ZSTD_lazy2   },  /* level  9 */
+    { 18, 18, 19,  6,  4,  8, ZSTD_lazy2   },  /* level 10 */
+    { 18, 18, 19,  5,  4, 12, ZSTD_btlazy2 },  /* level 11.*/
+    { 18, 19, 19,  7,  4, 12, ZSTD_btlazy2 },  /* level 12.*/
+    { 18, 18, 19,  4,  4, 16, ZSTD_btopt   },  /* level 13 */
+    { 18, 18, 19,  4,  3, 32, ZSTD_btopt   },  /* level 14.*/
+    { 18, 18, 19,  6,  3,128, ZSTD_btopt   },  /* level 15.*/
+    { 18, 19, 19,  6,  3,128, ZSTD_btultra },  /* level 16.*/
+    { 18, 19, 19,  8,  3,256, ZSTD_btultra },  /* level 17.*/
+    { 18, 19, 19,  6,  3,128, ZSTD_btultra2},  /* level 18.*/
+    { 18, 19, 19,  8,  3,256, ZSTD_btultra2},  /* level 19.*/
+    { 18, 19, 19, 10,  3,512, ZSTD_btultra2},  /* level 20.*/
+    { 18, 19, 19, 12,  3,512, ZSTD_btultra2},  /* level 21.*/
+    { 18, 19, 19, 13,  3,999, ZSTD_btultra2},  /* level 22.*/
+},
+{   /* for srcSize <= 128 KB */
+    /* W,  C,  H,  S,  L,  T, strat */
+    { 17, 12, 12,  1,  5,  1, ZSTD_fast    },  /* base for negative levels */
+    { 17, 12, 13,  1,  6,  0, ZSTD_fast    },  /* level  1 */
+    { 17, 13, 15,  1,  5,  0, ZSTD_fast    },  /* level  2 */
+    { 17, 15, 16,  2,  5,  0, ZSTD_dfast   },  /* level  3 */
+    { 17, 17, 17,  2,  4,  0, ZSTD_dfast   },  /* level  4 */
+    { 17, 16, 17,  3,  4,  2, ZSTD_greedy  },  /* level  5 */
+    { 17, 16, 17,  3,  4,  4, ZSTD_lazy    },  /* level  6 */
+    { 17, 16, 17,  3,  4,  8, ZSTD_lazy2   },  /* level  7 */
+    { 17, 16, 17,  4,  4,  8, ZSTD_lazy2   },  /* level  8 */
+    { 17, 16, 17,  5,  4,  8, ZSTD_lazy2   },  /* level  9 */
+    { 17, 16, 17,  6,  4,  8, ZSTD_lazy2   },  /* level 10 */
+    { 17, 17, 17,  5,  4,  8, ZSTD_btlazy2 },  /* level 11 */
+    { 17, 18, 17,  7,  4, 12, ZSTD_btlazy2 },  /* level 12 */
+    { 17, 18, 17,  3,  4, 12, ZSTD_btopt   },  /* level 13.*/
+    { 17, 18, 17,  4,  3, 32, ZSTD_btopt   },  /* level 14.*/
+    { 17, 18, 17,  6,  3,256, ZSTD_btopt   },  /* level 15.*/
+    { 17, 18, 17,  6,  3,128, ZSTD_btultra },  /* level 16.*/
+    { 17, 18, 17,  8,  3,256, ZSTD_btultra },  /* level 17.*/
+    { 17, 18, 17, 10,  3,512, ZSTD_btultra },  /* level 18.*/
+    { 17, 18, 17,  5,  3,256, ZSTD_btultra2},  /* level 19.*/
+    { 17, 18, 17,  7,  3,512, ZSTD_btultra2},  /* level 20.*/
+    { 17, 18, 17,  9,  3,512, ZSTD_btultra2},  /* level 21.*/
+    { 17, 18, 17, 11,  3,999, ZSTD_btultra2},  /* level 22.*/
+},
+{   /* for srcSize <= 16 KB */
+    /* W,  C,  H,  S,  L,  T, strat */
+    { 14, 12, 13,  1,  5,  1, ZSTD_fast    },  /* base for negative levels */
+    { 14, 14, 15,  1,  5,  0, ZSTD_fast    },  /* level  1 */
+    { 14, 14, 15,  1,  4,  0, ZSTD_fast    },  /* level  2 */
+    { 14, 14, 15,  2,  4,  0, ZSTD_dfast   },  /* level  3 */
+    { 14, 14, 14,  4,  4,  2, ZSTD_greedy  },  /* level  4 */
+    { 14, 14, 14,  3,  4,  4, ZSTD_lazy    },  /* level  5.*/
+    { 14, 14, 14,  4,  4,  8, ZSTD_lazy2   },  /* level  6 */
+    { 14, 14, 14,  6,  4,  8, ZSTD_lazy2   },  /* level  7 */
+    { 14, 14, 14,  8,  4,  8, ZSTD_lazy2   },  /* level  8.*/
+    { 14, 15, 14,  5,  4,  8, ZSTD_btlazy2 },  /* level  9.*/
+    { 14, 15, 14,  9,  4,  8, ZSTD_btlazy2 },  /* level 10.*/
+    { 14, 15, 14,  3,  4, 12, ZSTD_btopt   },  /* level 11.*/
+    { 14, 15, 14,  4,  3, 24, ZSTD_btopt   },  /* level 12.*/
+    { 14, 15, 14,  5,  3, 32, ZSTD_btultra },  /* level 13.*/
+    { 14, 15, 15,  6,  3, 64, ZSTD_btultra },  /* level 14.*/
+    { 14, 15, 15,  7,  3,256, ZSTD_btultra },  /* level 15.*/
+    { 14, 15, 15,  5,  3, 48, ZSTD_btultra2},  /* level 16.*/
+    { 14, 15, 15,  6,  3,128, ZSTD_btultra2},  /* level 17.*/
+    { 14, 15, 15,  7,  3,256, ZSTD_btultra2},  /* level 18.*/
+    { 14, 15, 15,  8,  3,256, ZSTD_btultra2},  /* level 19.*/
+    { 14, 15, 15,  8,  3,512, ZSTD_btultra2},  /* level 20.*/
+    { 14, 15, 15,  9,  3,512, ZSTD_btultra2},  /* level 21.*/
+    { 14, 15, 15, 10,  3,999, ZSTD_btultra2},  /* level 22.*/
+},
+};
+
+
+
+#endif  /* ZSTD_CLEVELS_H */
--- a/third_party/zstd/lib/compress/fse_compress.c
+++ b/third_party/zstd/lib/compress/fse_compress.c
@ -0,0 +1,625 @@
+// clang-format off
+/* ******************************************************************
+ * FSE : Finite State Entropy encoder
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ *  You can contact the author at :
+ *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *  - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include "third_party/zstd/lib/common/compiler.h"
+#include "third_party/zstd/lib/common/mem.h"        /* U32, U16, etc. */
+#include "third_party/zstd/lib/common/debug.h"      /* assert, DEBUGLOG */
+#include "third_party/zstd/lib/compress/hist.h"       /* HIST_count_wksp */
+#include "third_party/zstd/lib/common/bitstream.h"
+#define FSE_STATIC_LINKING_ONLY
+#include "third_party/zstd/lib/common/fse.h"
+#include "third_party/zstd/lib/common/error_private.h"
+#define ZSTD_DEPS_NEED_MALLOC
+#define ZSTD_DEPS_NEED_MATH64
+#include "third_party/zstd/lib/common/zstd_deps.h"  /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */
+#include "third_party/zstd/lib/common/bits.h" /* ZSTD_highbit32 */
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSE_isError ERR_isError
+
+
+/* **************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#  error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#  error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+
+/* Function templates */
+
+/* FSE_buildCTable_wksp() :
+ * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
+ * wkspSize should be sized to handle worst case situation, which is `1<<max_tableLog * sizeof(FSE_FUNCTION_TYPE)`
+ * workSpace must also be properly aligned with FSE_FUNCTION_TYPE requirements
+ */
+size_t FSE_buildCTable_wksp(FSE_CTable* ct,
+                      const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
+                            void* workSpace, size_t wkspSize)
+{
+    U32 const tableSize = 1 << tableLog;
+    U32 const tableMask = tableSize - 1;
+    void* const ptr = ct;
+    U16* const tableU16 = ( (U16*) ptr) + 2;
+    void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableLog ? tableSize>>1 : 1) ;
+    FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
+    U32 const step = FSE_TABLESTEP(tableSize);
+    U32 const maxSV1 = maxSymbolValue+1;
+
+    U16* cumul = (U16*)workSpace;   /* size = maxSV1 */
+    FSE_FUNCTION_TYPE* const tableSymbol = (FSE_FUNCTION_TYPE*)(cumul + (maxSV1+1));  /* size = tableSize */
+
+    U32 highThreshold = tableSize-1;
+
+    assert(((size_t)workSpace & 1) == 0);  /* Must be 2 bytes-aligned */
+    if (FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) > wkspSize) return ERROR(tableLog_tooLarge);
+    /* CTable header */
+    tableU16[-2] = (U16) tableLog;
+    tableU16[-1] = (U16) maxSymbolValue;
+    assert(tableLog < 16);   /* required for threshold strategy to work */
+
+    /* For explanations on how to distribute symbol values over the table :
+     * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+
+     #ifdef __clang_analyzer__
+     ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize);   /* useless initialization, just to keep scan-build happy */
+     #endif
+
+    /* symbol start positions */
+    {   U32 u;
+        cumul[0] = 0;
+        for (u=1; u <= maxSV1; u++) {
+            if (normalizedCounter[u-1]==-1) {  /* Low proba symbol */
+                cumul[u] = cumul[u-1] + 1;
+                tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u-1);
+            } else {
+                assert(normalizedCounter[u-1] >= 0);
+                cumul[u] = cumul[u-1] + (U16)normalizedCounter[u-1];
+                assert(cumul[u] >= cumul[u-1]);  /* no overflow */
+        }   }
+        cumul[maxSV1] = (U16)(tableSize+1);
+    }
+
+    /* Spread symbols */
+    if (highThreshold == tableSize - 1) {
+        /* Case for no low prob count symbols. Lay down 8 bytes at a time
+         * to reduce branch misses since we are operating on a small block
+         */
+        BYTE* const spread = tableSymbol + tableSize; /* size = tableSize + 8 (may write beyond tableSize) */
+        {   U64 const add = 0x0101010101010101ull;
+            size_t pos = 0;
+            U64 sv = 0;
+            U32 s;
+            for (s=0; s<maxSV1; ++s, sv += add) {
+                int i;
+                int const n = normalizedCounter[s];
+                MEM_write64(spread + pos, sv);
+                for (i = 8; i < n; i += 8) {
+                    MEM_write64(spread + pos + i, sv);
+                }
+                assert(n>=0);
+                pos += (size_t)n;
+            }
+        }
+        /* Spread symbols across the table. Lack of lowprob symbols means that
+         * we don't need variable sized inner loop, so we can unroll the loop and
+         * reduce branch misses.
+         */
+        {   size_t position = 0;
+            size_t s;
+            size_t const unroll = 2; /* Experimentally determined optimal unroll */
+            assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
+            for (s = 0; s < (size_t)tableSize; s += unroll) {
+                size_t u;
+                for (u = 0; u < unroll; ++u) {
+                    size_t const uPosition = (position + (u * step)) & tableMask;
+                    tableSymbol[uPosition] = spread[s + u];
+                }
+                position = (position + (unroll * step)) & tableMask;
+            }
+            assert(position == 0);   /* Must have initialized all positions */
+        }
+    } else {
+        U32 position = 0;
+        U32 symbol;
+        for (symbol=0; symbol<maxSV1; symbol++) {
+            int nbOccurrences;
+            int const freq = normalizedCounter[symbol];
+            for (nbOccurrences=0; nbOccurrences<freq; nbOccurrences++) {
+                tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol;
+                position = (position + step) & tableMask;
+                while (position > highThreshold)
+                    position = (position + step) & tableMask;   /* Low proba area */
+        }   }
+        assert(position==0);  /* Must have initialized all positions */
+    }
+
+    /* Build table */
+    {   U32 u; for (u=0; u<tableSize; u++) {
+        FSE_FUNCTION_TYPE s = tableSymbol[u];   /* note : static analyzer may not understand tableSymbol is properly initialized */
+        tableU16[cumul[s]++] = (U16) (tableSize+u);   /* TableU16 : sorted by symbol order; gives next state value */
+    }   }
+
+    /* Build Symbol Transformation Table */
+    {   unsigned total = 0;
+        unsigned s;
+        for (s=0; s<=maxSymbolValue; s++) {
+            switch (normalizedCounter[s])
+            {
+            case  0:
+                /* filling nonetheless, for compatibility with FSE_getMaxNbBits() */
+                symbolTT[s].deltaNbBits = ((tableLog+1) << 16) - (1<<tableLog);
+                break;
+
+            case -1:
+            case  1:
+                symbolTT[s].deltaNbBits = (tableLog << 16) - (1<<tableLog);
+                assert(total <= INT_MAX);
+                symbolTT[s].deltaFindState = (int)(total - 1);
+                total ++;
+                break;
+            default :
+                assert(normalizedCounter[s] > 1);
+                {   U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1);
+                    U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut;
+                    symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
+                    symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]);
+                    total +=  (unsigned)normalizedCounter[s];
+    }   }   }   }
+
+#if 0  /* debug : symbol costs */
+    DEBUGLOG(5, "\n --- table statistics : ");
+    {   U32 symbol;
+        for (symbol=0; symbol<=maxSymbolValue; symbol++) {
+            DEBUGLOG(5, "%3u: w=%3i,   maxBits=%u, fracBits=%.2f",
+                symbol, normalizedCounter[symbol],
+                FSE_getMaxNbBits(symbolTT, symbol),
+                (double)FSE_bitCost(symbolTT, tableLog, symbol, 8) / 256);
+    }   }
+#endif
+
+    return 0;
+}
+
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/*-**************************************************************
+*  FSE NCount encoding
+****************************************************************/
+size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
+{
+    size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog
+                                   + 4 /* bitCount initialized at 4 */
+                                   + 2 /* first two symbols may use one additional bit each */) / 8)
+                                    + 1 /* round up to whole nb bytes */
+                                    + 2 /* additional two bytes for bitstream flush */;
+    return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND;  /* maxSymbolValue==0 ? use default */
+}
+
+static size_t
+FSE_writeNCount_generic (void* header, size_t headerBufferSize,
+                   const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
+                         unsigned writeIsSafe)
+{
+    BYTE* const ostart = (BYTE*) header;
+    BYTE* out = ostart;
+    BYTE* const oend = ostart + headerBufferSize;
+    int nbBits;
+    const int tableSize = 1 << tableLog;
+    int remaining;
+    int threshold;
+    U32 bitStream = 0;
+    int bitCount = 0;
+    unsigned symbol = 0;
+    unsigned const alphabetSize = maxSymbolValue + 1;
+    int previousIs0 = 0;
+
+    /* Table Size */
+    bitStream += (tableLog-FSE_MIN_TABLELOG) << bitCount;
+    bitCount  += 4;
+
+    /* Init */
+    remaining = tableSize+1;   /* +1 for extra accuracy */
+    threshold = tableSize;
+    nbBits = tableLog+1;
+
+    while ((symbol < alphabetSize) && (remaining>1)) {  /* stops at 1 */
+        if (previousIs0) {
+            unsigned start = symbol;
+            while ((symbol < alphabetSize) && !normalizedCounter[symbol]) symbol++;
+            if (symbol == alphabetSize) break;   /* incorrect distribution */
+            while (symbol >= start+24) {
+                start+=24;
+                bitStream += 0xFFFFU << bitCount;
+                if ((!writeIsSafe) && (out > oend-2))
+                    return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+                out[0] = (BYTE) bitStream;
+                out[1] = (BYTE)(bitStream>>8);
+                out+=2;
+                bitStream>>=16;
+            }
+            while (symbol >= start+3) {
+                start+=3;
+                bitStream += 3 << bitCount;
+                bitCount += 2;
+            }
+            bitStream += (symbol-start) << bitCount;
+            bitCount += 2;
+            if (bitCount>16) {
+                if ((!writeIsSafe) && (out > oend - 2))
+                    return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+                out[0] = (BYTE)bitStream;
+                out[1] = (BYTE)(bitStream>>8);
+                out += 2;
+                bitStream >>= 16;
+                bitCount -= 16;
+        }   }
+        {   int count = normalizedCounter[symbol++];
+            int const max = (2*threshold-1) - remaining;
+            remaining -= count < 0 ? -count : count;
+            count++;   /* +1 for extra accuracy */
+            if (count>=threshold)
+                count += max;   /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
+            bitStream += count << bitCount;
+            bitCount  += nbBits;
+            bitCount  -= (count<max);
+            previousIs0  = (count==1);
+            if (remaining<1) return ERROR(GENERIC);
+            while (remaining<threshold) { nbBits--; threshold>>=1; }
+        }
+        if (bitCount>16) {
+            if ((!writeIsSafe) && (out > oend - 2))
+                return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+            out[0] = (BYTE)bitStream;
+            out[1] = (BYTE)(bitStream>>8);
+            out += 2;
+            bitStream >>= 16;
+            bitCount -= 16;
+    }   }
+
+    if (remaining != 1)
+        return ERROR(GENERIC);  /* incorrect normalized distribution */
+    assert(symbol <= alphabetSize);
+
+    /* flush remaining bitStream */
+    if ((!writeIsSafe) && (out > oend - 2))
+        return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+    out[0] = (BYTE)bitStream;
+    out[1] = (BYTE)(bitStream>>8);
+    out+= (bitCount+7) /8;
+
+    return (out-ostart);
+}
+
+
+size_t FSE_writeNCount (void* buffer, size_t bufferSize,
+                  const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);   /* Unsupported */
+    if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC);   /* Unsupported */
+
+    if (bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog))
+        return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0);
+
+    return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1 /* write in buffer is safe */);
+}
+
+
+/*-**************************************************************
+*  FSE Compression Code
+****************************************************************/
+
+/* provides the minimum logSize to safely represent a distribution */
+static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
+{
+    U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1;
+    U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2;
+    U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols;
+    assert(srcSize > 1); /* Not supported, RLE should be used instead */
+    return minBits;
+}
+
+unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus)
+{
+    U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus;
+    U32 tableLog = maxTableLog;
+    U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue);
+    assert(srcSize > 1); /* Not supported, RLE should be used instead */
+    if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
+    if (maxBitsSrc < tableLog) tableLog = maxBitsSrc;   /* Accuracy can be reduced */
+    if (minBits > tableLog) tableLog = minBits;   /* Need a minimum to safely represent all symbol values */
+    if (tableLog < FSE_MIN_TABLELOG) tableLog = FSE_MIN_TABLELOG;
+    if (tableLog > FSE_MAX_TABLELOG) tableLog = FSE_MAX_TABLELOG;
+    return tableLog;
+}
+
+unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{
+    return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 2);
+}
+
+/* Secondary normalization method.
+   To be used when primary method fails. */
+
+static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, size_t total, U32 maxSymbolValue, short lowProbCount)
+{
+    short const NOT_YET_ASSIGNED = -2;
+    U32 s;
+    U32 distributed = 0;
+    U32 ToDistribute;
+
+    /* Init */
+    U32 const lowThreshold = (U32)(total >> tableLog);
+    U32 lowOne = (U32)((total * 3) >> (tableLog + 1));
+
+    for (s=0; s<=maxSymbolValue; s++) {
+        if (count[s] == 0) {
+            norm[s]=0;
+            continue;
+        }
+        if (count[s] <= lowThreshold) {
+            norm[s] = lowProbCount;
+            distributed++;
+            total -= count[s];
+            continue;
+        }
+        if (count[s] <= lowOne) {
+            norm[s] = 1;
+            distributed++;
+            total -= count[s];
+            continue;
+        }
+
+        norm[s]=NOT_YET_ASSIGNED;
+    }
+    ToDistribute = (1 << tableLog) - distributed;
+
+    if (ToDistribute == 0)
+        return 0;
+
+    if ((total / ToDistribute) > lowOne) {
+        /* risk of rounding to zero */
+        lowOne = (U32)((total * 3) / (ToDistribute * 2));
+        for (s=0; s<=maxSymbolValue; s++) {
+            if ((norm[s] == NOT_YET_ASSIGNED) && (count[s] <= lowOne)) {
+                norm[s] = 1;
+                distributed++;
+                total -= count[s];
+                continue;
+        }   }
+        ToDistribute = (1 << tableLog) - distributed;
+    }
+
+    if (distributed == maxSymbolValue+1) {
+        /* all values are pretty poor;
+           probably incompressible data (should have already been detected);
+           find max, then give all remaining points to max */
+        U32 maxV = 0, maxC = 0;
+        for (s=0; s<=maxSymbolValue; s++)
+            if (count[s] > maxC) { maxV=s; maxC=count[s]; }
+        norm[maxV] += (short)ToDistribute;
+        return 0;
+    }
+
+    if (total == 0) {
+        /* all of the symbols were low enough for the lowOne or lowThreshold */
+        for (s=0; ToDistribute > 0; s = (s+1)%(maxSymbolValue+1))
+            if (norm[s] > 0) { ToDistribute--; norm[s]++; }
+        return 0;
+    }
+
+    {   U64 const vStepLog = 62 - tableLog;
+        U64 const mid = (1ULL << (vStepLog-1)) - 1;
+        U64 const rStep = ZSTD_div64((((U64)1<<vStepLog) * ToDistribute) + mid, (U32)total);   /* scale on remaining */
+        U64 tmpTotal = mid;
+        for (s=0; s<=maxSymbolValue; s++) {
+            if (norm[s]==NOT_YET_ASSIGNED) {
+                U64 const end = tmpTotal + (count[s] * rStep);
+                U32 const sStart = (U32)(tmpTotal >> vStepLog);
+                U32 const sEnd = (U32)(end >> vStepLog);
+                U32 const weight = sEnd - sStart;
+                if (weight < 1)
+                    return ERROR(GENERIC);
+                norm[s] = (short)weight;
+                tmpTotal = end;
+    }   }   }
+
+    return 0;
+}
+
+size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
+                           const unsigned* count, size_t total,
+                           unsigned maxSymbolValue, unsigned useLowProbCount)
+{
+    /* Sanity checks */
+    if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
+    if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC);   /* Unsupported size */
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);   /* Unsupported size */
+    if (tableLog < FSE_minTableLog(total, maxSymbolValue)) return ERROR(GENERIC);   /* Too small tableLog, compression potentially impossible */
+
+    {   static U32 const rtbTable[] = {     0, 473195, 504333, 520860, 550000, 700000, 750000, 830000 };
+        short const lowProbCount = useLowProbCount ? -1 : 1;
+        U64 const scale = 62 - tableLog;
+        U64 const step = ZSTD_div64((U64)1<<62, (U32)total);   /* <== here, one division ! */
+        U64 const vStep = 1ULL<<(scale-20);
+        int stillToDistribute = 1<<tableLog;
+        unsigned s;
+        unsigned largest=0;
+        short largestP=0;
+        U32 lowThreshold = (U32)(total >> tableLog);
+
+        for (s=0; s<=maxSymbolValue; s++) {
+            if (count[s] == total) return 0;   /* rle special case */
+            if (count[s] == 0) { normalizedCounter[s]=0; continue; }
+            if (count[s] <= lowThreshold) {
+                normalizedCounter[s] = lowProbCount;
+                stillToDistribute--;
+            } else {
+                short proba = (short)((count[s]*step) >> scale);
+                if (proba<8) {
+                    U64 restToBeat = vStep * rtbTable[proba];
+                    proba += (count[s]*step) - ((U64)proba<<scale) > restToBeat;
+                }
+                if (proba > largestP) { largestP=proba; largest=s; }
+                normalizedCounter[s] = proba;
+                stillToDistribute -= proba;
+        }   }
+        if (-stillToDistribute >= (normalizedCounter[largest] >> 1)) {
+            /* corner case, need another normalization method */
+            size_t const errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue, lowProbCount);
+            if (FSE_isError(errorCode)) return errorCode;
+        }
+        else normalizedCounter[largest] += (short)stillToDistribute;
+    }
+
+#if 0
+    {   /* Print Table (debug) */
+        U32 s;
+        U32 nTotal = 0;
+        for (s=0; s<=maxSymbolValue; s++)
+            RAWLOG(2, "%3i: %4i \n", s, normalizedCounter[s]);
+        for (s=0; s<=maxSymbolValue; s++)
+            nTotal += abs(normalizedCounter[s]);
+        if (nTotal != (1U<<tableLog))
+            RAWLOG(2, "Warning !!! Total == %u != %u !!!", nTotal, 1U<<tableLog);
+        getchar();
+    }
+#endif
+
+    return tableLog;
+}
+
+/* fake FSE_CTable, for rle input (always same symbol) */
+size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue)
+{
+    void* ptr = ct;
+    U16* tableU16 = ( (U16*) ptr) + 2;
+    void* FSCTptr = (U32*)ptr + 2;
+    FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) FSCTptr;
+
+    /* header */
+    tableU16[-2] = (U16) 0;
+    tableU16[-1] = (U16) symbolValue;
+
+    /* Build table */
+    tableU16[0] = 0;
+    tableU16[1] = 0;   /* just in case */
+
+    /* Build Symbol Transformation Table */
+    symbolTT[symbolValue].deltaNbBits = 0;
+    symbolTT[symbolValue].deltaFindState = 0;
+
+    return 0;
+}
+
+
+static size_t FSE_compress_usingCTable_generic (void* dst, size_t dstSize,
+                           const void* src, size_t srcSize,
+                           const FSE_CTable* ct, const unsigned fast)
+{
+    const BYTE* const istart = (const BYTE*) src;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* ip=iend;
+
+    BIT_CStream_t bitC;
+    FSE_CState_t CState1, CState2;
+
+    /* init */
+    if (srcSize <= 2) return 0;
+    { size_t const initError = BIT_initCStream(&bitC, dst, dstSize);
+      if (FSE_isError(initError)) return 0; /* not enough space available to write a bitstream */ }
+
+#define FSE_FLUSHBITS(s)  (fast ? BIT_flushBitsFast(s) : BIT_flushBits(s))
+
+    if (srcSize & 1) {
+        FSE_initCState2(&CState1, ct, *--ip);
+        FSE_initCState2(&CState2, ct, *--ip);
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        FSE_FLUSHBITS(&bitC);
+    } else {
+        FSE_initCState2(&CState2, ct, *--ip);
+        FSE_initCState2(&CState1, ct, *--ip);
+    }
+
+    /* join to mod 4 */
+    srcSize -= 2;
+    if ((sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) && (srcSize & 2)) {  /* test bit 2 */
+        FSE_encodeSymbol(&bitC, &CState2, *--ip);
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        FSE_FLUSHBITS(&bitC);
+    }
+
+    /* 2 or 4 encoding per loop */
+    while ( ip>istart ) {
+
+        FSE_encodeSymbol(&bitC, &CState2, *--ip);
+
+        if (sizeof(bitC.bitContainer)*8 < FSE_MAX_TABLELOG*2+7 )   /* this test must be static */
+            FSE_FLUSHBITS(&bitC);
+
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+
+        if (sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) {  /* this test must be static */
+            FSE_encodeSymbol(&bitC, &CState2, *--ip);
+            FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        }
+
+        FSE_FLUSHBITS(&bitC);
+    }
+
+    FSE_flushCState(&bitC, &CState2);
+    FSE_flushCState(&bitC, &CState1);
+    return BIT_closeCStream(&bitC);
+}
+
+size_t FSE_compress_usingCTable (void* dst, size_t dstSize,
+                           const void* src, size_t srcSize,
+                           const FSE_CTable* ct)
+{
+    unsigned const fast = (dstSize >= FSE_BLOCKBOUND(srcSize));
+
+    if (fast)
+        return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 1);
+    else
+        return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 0);
+}
+
+
+size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }
+
+#endif   /* FSE_COMMONDEFS_ONLY */
--- a/third_party/zstd/lib/compress/hist.c
+++ b/third_party/zstd/lib/compress/hist.c
@ -0,0 +1,182 @@
+// clang-format off
+/* ******************************************************************
+ * hist : Histogram functions
+ * part of Finite State Entropy project
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ *  You can contact the author at :
+ *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *  - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* --- dependencies --- */
+#include "third_party/zstd/lib/common/mem.h"             /* U32, BYTE, etc. */
+#include "third_party/zstd/lib/common/debug.h"           /* assert, DEBUGLOG */
+#include "third_party/zstd/lib/common/error_private.h"   /* ERROR */
+#include "third_party/zstd/lib/compress/hist.h"
+
+
+/* --- Error management --- */
+unsigned HIST_isError(size_t code) { return ERR_isError(code); }
+
+/*-**************************************************************
+ *  Histogram functions
+ ****************************************************************/
+unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
+                           const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* const end = ip + srcSize;
+    unsigned maxSymbolValue = *maxSymbolValuePtr;
+    unsigned largestCount=0;
+
+    ZSTD_memset(count, 0, (maxSymbolValue+1) * sizeof(*count));
+    if (srcSize==0) { *maxSymbolValuePtr = 0; return 0; }
+
+    while (ip<end) {
+        assert(*ip <= maxSymbolValue);
+        count[*ip++]++;
+    }
+
+    while (!count[maxSymbolValue]) maxSymbolValue--;
+    *maxSymbolValuePtr = maxSymbolValue;
+
+    {   U32 s;
+        for (s=0; s<=maxSymbolValue; s++)
+            if (count[s] > largestCount) largestCount = count[s];
+    }
+
+    return largestCount;
+}
+
+typedef enum { trustInput, checkMaxSymbolValue } HIST_checkInput_e;
+
+/* HIST_count_parallel_wksp() :
+ * store histogram into 4 intermediate tables, recombined at the end.
+ * this design makes better use of OoO cpus,
+ * and is noticeably faster when some values are heavily repeated.
+ * But it needs some additional workspace for intermediate tables.
+ * `workSpace` must be a U32 table of size >= HIST_WKSP_SIZE_U32.
+ * @return : largest histogram frequency,
+ *           or an error code (notably when histogram's alphabet is larger than *maxSymbolValuePtr) */
+static size_t HIST_count_parallel_wksp(
+                                unsigned* count, unsigned* maxSymbolValuePtr,
+                                const void* source, size_t sourceSize,
+                                HIST_checkInput_e check,
+                                U32* const workSpace)
+{
+    const BYTE* ip = (const BYTE*)source;
+    const BYTE* const iend = ip+sourceSize;
+    size_t const countSize = (*maxSymbolValuePtr + 1) * sizeof(*count);
+    unsigned max=0;
+    U32* const Counting1 = workSpace;
+    U32* const Counting2 = Counting1 + 256;
+    U32* const Counting3 = Counting2 + 256;
+    U32* const Counting4 = Counting3 + 256;
+
+    /* safety checks */
+    assert(*maxSymbolValuePtr <= 255);
+    if (!sourceSize) {
+        ZSTD_memset(count, 0, countSize);
+        *maxSymbolValuePtr = 0;
+        return 0;
+    }
+    ZSTD_memset(workSpace, 0, 4*256*sizeof(unsigned));
+
+    /* by stripes of 16 bytes */
+    {   U32 cached = MEM_read32(ip); ip += 4;
+        while (ip < iend-15) {
+            U32 c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+        }
+        ip-=4;
+    }
+
+    /* finish last symbols */
+    while (ip<iend) Counting1[*ip++]++;
+
+    {   U32 s;
+        for (s=0; s<256; s++) {
+            Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s];
+            if (Counting1[s] > max) max = Counting1[s];
+    }   }
+
+    {   unsigned maxSymbolValue = 255;
+        while (!Counting1[maxSymbolValue]) maxSymbolValue--;
+        if (check && maxSymbolValue > *maxSymbolValuePtr) return ERROR(maxSymbolValue_tooSmall);
+        *maxSymbolValuePtr = maxSymbolValue;
+        ZSTD_memmove(count, Counting1, countSize);   /* in case count & Counting1 are overlapping */
+    }
+    return (size_t)max;
+}
+
+/* HIST_countFast_wksp() :
+ * Same as HIST_countFast(), but using an externally provided scratch buffer.
+ * `workSpace` is a writable buffer which must be 4-bytes aligned,
+ * `workSpaceSize` must be >= HIST_WKSP_SIZE
+ */
+size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                          const void* source, size_t sourceSize,
+                          void* workSpace, size_t workSpaceSize)
+{
+    if (sourceSize < 1500) /* heuristic threshold */
+        return HIST_count_simple(count, maxSymbolValuePtr, source, sourceSize);
+    if ((size_t)workSpace & 3) return ERROR(GENERIC);  /* must be aligned on 4-bytes boundaries */
+    if (workSpaceSize < HIST_WKSP_SIZE) return ERROR(workSpace_tooSmall);
+    return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, trustInput, (U32*)workSpace);
+}
+
+/* HIST_count_wksp() :
+ * Same as HIST_count(), but using an externally provided scratch buffer.
+ * `workSpace` size must be table of >= HIST_WKSP_SIZE_U32 unsigned */
+size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                       const void* source, size_t sourceSize,
+                       void* workSpace, size_t workSpaceSize)
+{
+    if ((size_t)workSpace & 3) return ERROR(GENERIC);  /* must be aligned on 4-bytes boundaries */
+    if (workSpaceSize < HIST_WKSP_SIZE) return ERROR(workSpace_tooSmall);
+    if (*maxSymbolValuePtr < 255)
+        return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, checkMaxSymbolValue, (U32*)workSpace);
+    *maxSymbolValuePtr = 255;
+    return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace, workSpaceSize);
+}
+
+#ifndef ZSTD_NO_UNUSED_FUNCTIONS
+/* fast variant (unsafe : won't check if src contains values beyond count[] limit) */
+size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr,
+                     const void* source, size_t sourceSize)
+{
+    unsigned tmpCounters[HIST_WKSP_SIZE_U32];
+    return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, tmpCounters, sizeof(tmpCounters));
+}
+
+size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr,
+                 const void* src, size_t srcSize)
+{
+    unsigned tmpCounters[HIST_WKSP_SIZE_U32];
+    return HIST_count_wksp(count, maxSymbolValuePtr, src, srcSize, tmpCounters, sizeof(tmpCounters));
+}
+#endif
--- a/third_party/zstd/lib/compress/hist.h
+++ b/third_party/zstd/lib/compress/hist.h
@ -0,0 +1,77 @@
+// clang-format off
+/* ******************************************************************
+ * hist : Histogram functions
+ * part of Finite State Entropy project
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ *  You can contact the author at :
+ *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *  - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* --- dependencies --- */
+#include "third_party/zstd/lib/common/zstd_deps.h"   /* size_t */
+#include "libc/assert.h"
+
+
+/* --- simple histogram functions --- */
+
+/*! HIST_count():
+ *  Provides the precise count of each byte within a table 'count'.
+ * 'count' is a table of unsigned int, of minimum size (*maxSymbolValuePtr+1).
+ *  Updates *maxSymbolValuePtr with actual largest symbol value detected.
+ * @return : count of the most frequent symbol (which isn't identified).
+ *           or an error code, which can be tested using HIST_isError().
+ *           note : if return == srcSize, there is only one symbol.
+ */
+size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr,
+                  const void* src, size_t srcSize);
+
+unsigned HIST_isError(size_t code);  /**< tells if a return value is an error code */
+
+
+/* --- advanced histogram functions --- */
+
+#define HIST_WKSP_SIZE_U32 1024
+#define HIST_WKSP_SIZE    (HIST_WKSP_SIZE_U32 * sizeof(unsigned))
+/** HIST_count_wksp() :
+ *  Same as HIST_count(), but using an externally provided scratch buffer.
+ *  Benefit is this function will use very little stack space.
+ * `workSpace` is a writable buffer which must be 4-bytes aligned,
+ * `workSpaceSize` must be >= HIST_WKSP_SIZE
+ */
+size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                       const void* src, size_t srcSize,
+                       void* workSpace, size_t workSpaceSize);
+
+/** HIST_countFast() :
+ *  same as HIST_count(), but blindly trusts that all byte values within src are <= *maxSymbolValuePtr.
+ *  This function is unsafe, and will segfault if any value within `src` is `> *maxSymbolValuePtr`
+ */
+size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr,
+                      const void* src, size_t srcSize);
+
+/** HIST_countFast_wksp() :
+ *  Same as HIST_countFast(), but using an externally provided scratch buffer.
+ * `workSpace` is a writable buffer which must be 4-bytes aligned,
+ * `workSpaceSize` must be >= HIST_WKSP_SIZE
+ */
+size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                           const void* src, size_t srcSize,
+                           void* workSpace, size_t workSpaceSize);
+
+/*! HIST_count_simple() :
+ *  Same as HIST_countFast(), this function is unsafe,
+ *  and will segfault if any value within `src` is `> *maxSymbolValuePtr`.
+ *  It is also a bit slower for large inputs.
+ *  However, it does not need any additional memory (not even on stack).
+ * @return : count of the most frequent symbol.
+ *  Note this function doesn't produce any error (i.e. it must succeed).
+ */
+unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
+                           const void* src, size_t srcSize);
--- a/third_party/zstd/lib/compress/huf_compress.c
+++ b/third_party/zstd/lib/compress/huf_compress.c
--- a/third_party/zstd/lib/compress/zstd_compress.c
+++ b/third_party/zstd/lib/compress/zstd_compress.c
--- a/third_party/zstd/lib/compress/zstd_compress_internal.h
+++ b/third_party/zstd/lib/compress/zstd_compress_internal.h
--- a/third_party/zstd/lib/compress/zstd_compress_literals.c
+++ b/third_party/zstd/lib/compress/zstd_compress_literals.c
@ -0,0 +1,236 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ /*-*************************************
+ *  Dependencies
+ ***************************************/
+#include "third_party/zstd/lib/compress/zstd_compress_literals.h"
+
+
+/* **************************************************************
+*  Debug Traces
+****************************************************************/
+#if DEBUGLEVEL >= 2
+
+static size_t showHexa(const void* src, size_t srcSize)
+{
+    const BYTE* const ip = (const BYTE*)src;
+    size_t u;
+    for (u=0; u<srcSize; u++) {
+        RAWLOG(5, " %02X", ip[u]); (void)ip;
+    }
+    RAWLOG(5, " \n");
+    return srcSize;
+}
+
+#endif
+
+
+/* **************************************************************
+*  Literals compression - special cases
+****************************************************************/
+size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    BYTE* const ostart = (BYTE*)dst;
+    U32   const flSize = 1 + (srcSize>31) + (srcSize>4095);
+
+    DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity);
+
+    RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, "");
+
+    switch(flSize)
+    {
+        case 1: /* 2 - 1 - 5 */
+            ostart[0] = (BYTE)((U32)set_basic + (srcSize<<3));
+            break;
+        case 2: /* 2 - 2 - 12 */
+            MEM_writeLE16(ostart, (U16)((U32)set_basic + (1<<2) + (srcSize<<4)));
+            break;
+        case 3: /* 2 - 2 - 20 */
+            MEM_writeLE32(ostart, (U32)((U32)set_basic + (3<<2) + (srcSize<<4)));
+            break;
+        default:   /* not necessary : flSize is {1,2,3} */
+            assert(0);
+    }
+
+    ZSTD_memcpy(ostart + flSize, src, srcSize);
+    DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize));
+    return srcSize + flSize;
+}
+
+static int allBytesIdentical(const void* src, size_t srcSize)
+{
+    assert(srcSize >= 1);
+    assert(src != NULL);
+    {   const BYTE b = ((const BYTE*)src)[0];
+        size_t p;
+        for (p=1; p<srcSize; p++) {
+            if (((const BYTE*)src)[p] != b) return 0;
+        }
+        return 1;
+    }
+}
+
+size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    BYTE* const ostart = (BYTE*)dst;
+    U32   const flSize = 1 + (srcSize>31) + (srcSize>4095);
+
+    assert(dstCapacity >= 4); (void)dstCapacity;
+    assert(allBytesIdentical(src, srcSize));
+
+    switch(flSize)
+    {
+        case 1: /* 2 - 1 - 5 */
+            ostart[0] = (BYTE)((U32)set_rle + (srcSize<<3));
+            break;
+        case 2: /* 2 - 2 - 12 */
+            MEM_writeLE16(ostart, (U16)((U32)set_rle + (1<<2) + (srcSize<<4)));
+            break;
+        case 3: /* 2 - 2 - 20 */
+            MEM_writeLE32(ostart, (U32)((U32)set_rle + (3<<2) + (srcSize<<4)));
+            break;
+        default:   /* not necessary : flSize is {1,2,3} */
+            assert(0);
+    }
+
+    ostart[flSize] = *(const BYTE*)src;
+    DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1);
+    return flSize+1;
+}
+
+/* ZSTD_minLiteralsToCompress() :
+ * returns minimal amount of literals
+ * for literal compression to even be attempted.
+ * Minimum is made tighter as compression strategy increases.
+ */
+static size_t
+ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat)
+{
+    assert((int)strategy >= 0);
+    assert((int)strategy <= 9);
+    /* btultra2 : min 8 bytes;
+     * then 2x larger for each successive compression strategy
+     * max threshold 64 bytes */
+    {   int const shift = MIN(9-(int)strategy, 3);
+        size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift;
+        DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc);
+        return mintc;
+    }
+}
+
+size_t ZSTD_compressLiterals (
+                  void* dst, size_t dstCapacity,
+            const void* src, size_t srcSize,
+                  void* entropyWorkspace, size_t entropyWorkspaceSize,
+            const ZSTD_hufCTables_t* prevHuf,
+                  ZSTD_hufCTables_t* nextHuf,
+                  ZSTD_strategy strategy,
+                  int disableLiteralCompression,
+                  int suspectUncompressible,
+                  int bmi2)
+{
+    size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);
+    BYTE*  const ostart = (BYTE*)dst;
+    U32 singleStream = srcSize < 256;
+    symbolEncodingType_e hType = set_compressed;
+    size_t cLitSize;
+
+    DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)",
+                disableLiteralCompression, (U32)srcSize, dstCapacity);
+
+    DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize));
+
+    /* Prepare nextEntropy assuming reusing the existing table */
+    ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+
+    if (disableLiteralCompression)
+        return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+
+    /* if too small, don't even attempt compression (speed opt) */
+    if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode))
+        return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+
+    RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression");
+    {   HUF_repeat repeat = prevHuf->repeatMode;
+        int const flags = 0
+            | (bmi2 ? HUF_flags_bmi2 : 0)
+            | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0)
+            | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0)
+            | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0);
+
+        typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int);
+        huf_compress_f huf_compress;
+        if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1;
+        huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat;
+        cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize,
+                                src, srcSize,
+                                HUF_SYMBOLVALUE_MAX, LitHufLog,
+                                entropyWorkspace, entropyWorkspaceSize,
+                                (HUF_CElt*)nextHuf->CTable,
+                                &repeat, flags);
+        DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize);
+        if (repeat != HUF_repeat_none) {
+            /* reused the existing table */
+            DEBUGLOG(5, "reusing statistics from previous huffman block");
+            hType = set_repeat;
+        }
+    }
+
+    {   size_t const minGain = ZSTD_minGain(srcSize, strategy);
+        if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) {
+            ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+            return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+    }   }
+    if (cLitSize==1) {
+        /* A return value of 1 signals that the alphabet consists of a single symbol.
+         * However, in some rare circumstances, it could be the compressed size (a single byte).
+         * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`.
+         * (it's also necessary to not generate statistics).
+         * Therefore, in such a case, actively check that all bytes are identical. */
+        if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) {
+            ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+            return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
+    }   }
+
+    if (hType == set_compressed) {
+        /* using a newly constructed table */
+        nextHuf->repeatMode = HUF_repeat_check;
+    }
+
+    /* Build header */
+    switch(lhSize)
+    {
+    case 3: /* 2 - 2 - 10 - 10 */
+        if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
+        {   U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14);
+            MEM_writeLE24(ostart, lhc);
+            break;
+        }
+    case 4: /* 2 - 2 - 14 - 14 */
+        assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
+        {   U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18);
+            MEM_writeLE32(ostart, lhc);
+            break;
+        }
+    case 5: /* 2 - 2 - 18 - 18 */
+        assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
+        {   U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22);
+            MEM_writeLE32(ostart, lhc);
+            ostart[4] = (BYTE)(cLitSize >> 10);
+            break;
+        }
+    default:  /* not possible : lhSize is {3,4,5} */
+        assert(0);
+    }
+    DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)srcSize, (U32)(lhSize+cLitSize));
+    return lhSize+cLitSize;
+}
--- a/third_party/zstd/lib/compress/zstd_compress_literals.h
+++ b/third_party/zstd/lib/compress/zstd_compress_literals.h
@ -0,0 +1,40 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMPRESS_LITERALS_H
+#define ZSTD_COMPRESS_LITERALS_H
+
+#include "third_party/zstd/lib/compress/zstd_compress_internal.h" /* ZSTD_hufCTables_t, ZSTD_minGain() */
+
+
+size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+/* ZSTD_compressRleLiteralsBlock() :
+ * Conditions :
+ * - All bytes in @src are identical
+ * - dstCapacity >= 4 */
+size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+/* ZSTD_compressLiterals():
+ * @entropyWorkspace: must be aligned on 4-bytes boundaries
+ * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE
+ * @suspectUncompressible: sampling checks, to potentially skip huffman coding
+ */
+size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity,
+                        const void* src, size_t srcSize,
+                              void* entropyWorkspace, size_t entropyWorkspaceSize,
+                        const ZSTD_hufCTables_t* prevHuf,
+                              ZSTD_hufCTables_t* nextHuf,
+                              ZSTD_strategy strategy, int disableLiteralCompression,
+                              int suspectUncompressible,
+                              int bmi2);
+
+#endif /* ZSTD_COMPRESS_LITERALS_H */
--- a/third_party/zstd/lib/compress/zstd_compress_sequences.c
+++ b/third_party/zstd/lib/compress/zstd_compress_sequences.c
@ -0,0 +1,443 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ /*-*************************************
+ *  Dependencies
+ ***************************************/
+#include "third_party/zstd/lib/compress/zstd_compress_sequences.h"
+
+/**
+ * -log2(x / 256) lookup table for x in [0, 256).
+ * If x == 0: Return 0
+ * Else: Return floor(-log2(x / 256) * 256)
+ */
+static unsigned const kInverseProbabilityLog256[256] = {
+    0,    2048, 1792, 1642, 1536, 1453, 1386, 1329, 1280, 1236, 1197, 1162,
+    1130, 1100, 1073, 1047, 1024, 1001, 980,  960,  941,  923,  906,  889,
+    874,  859,  844,  830,  817,  804,  791,  779,  768,  756,  745,  734,
+    724,  714,  704,  694,  685,  676,  667,  658,  650,  642,  633,  626,
+    618,  610,  603,  595,  588,  581,  574,  567,  561,  554,  548,  542,
+    535,  529,  523,  517,  512,  506,  500,  495,  489,  484,  478,  473,
+    468,  463,  458,  453,  448,  443,  438,  434,  429,  424,  420,  415,
+    411,  407,  402,  398,  394,  390,  386,  382,  377,  373,  370,  366,
+    362,  358,  354,  350,  347,  343,  339,  336,  332,  329,  325,  322,
+    318,  315,  311,  308,  305,  302,  298,  295,  292,  289,  286,  282,
+    279,  276,  273,  270,  267,  264,  261,  258,  256,  253,  250,  247,
+    244,  241,  239,  236,  233,  230,  228,  225,  222,  220,  217,  215,
+    212,  209,  207,  204,  202,  199,  197,  194,  192,  190,  187,  185,
+    182,  180,  178,  175,  173,  171,  168,  166,  164,  162,  159,  157,
+    155,  153,  151,  149,  146,  144,  142,  140,  138,  136,  134,  132,
+    130,  128,  126,  123,  121,  119,  117,  115,  114,  112,  110,  108,
+    106,  104,  102,  100,  98,   96,   94,   93,   91,   89,   87,   85,
+    83,   82,   80,   78,   76,   74,   73,   71,   69,   67,   66,   64,
+    62,   61,   59,   57,   55,   54,   52,   50,   49,   47,   46,   44,
+    42,   41,   39,   37,   36,   34,   33,   31,   30,   28,   26,   25,
+    23,   22,   20,   19,   17,   16,   14,   13,   11,   10,   8,    7,
+    5,    4,    2,    1,
+};
+
+static unsigned ZSTD_getFSEMaxSymbolValue(FSE_CTable const* ctable) {
+  void const* ptr = ctable;
+  U16 const* u16ptr = (U16 const*)ptr;
+  U32 const maxSymbolValue = MEM_read16(u16ptr + 1);
+  return maxSymbolValue;
+}
+
+/**
+ * Returns true if we should use ncount=-1 else we should
+ * use ncount=1 for low probability symbols instead.
+ */
+static unsigned ZSTD_useLowProbCount(size_t const nbSeq)
+{
+    /* Heuristic: This should cover most blocks <= 16K and
+     * start to fade out after 16K to about 32K depending on
+     * compressibility.
+     */
+    return nbSeq >= 2048;
+}
+
+/**
+ * Returns the cost in bytes of encoding the normalized count header.
+ * Returns an error if any of the helper functions return an error.
+ */
+static size_t ZSTD_NCountCost(unsigned const* count, unsigned const max,
+                              size_t const nbSeq, unsigned const FSELog)
+{
+    BYTE wksp[FSE_NCOUNTBOUND];
+    S16 norm[MaxSeq + 1];
+    const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max);
+    FORWARD_IF_ERROR(FSE_normalizeCount(norm, tableLog, count, nbSeq, max, ZSTD_useLowProbCount(nbSeq)), "");
+    return FSE_writeNCount(wksp, sizeof(wksp), norm, max, tableLog);
+}
+
+/**
+ * Returns the cost in bits of encoding the distribution described by count
+ * using the entropy bound.
+ */
+static size_t ZSTD_entropyCost(unsigned const* count, unsigned const max, size_t const total)
+{
+    unsigned cost = 0;
+    unsigned s;
+
+    assert(total > 0);
+    for (s = 0; s <= max; ++s) {
+        unsigned norm = (unsigned)((256 * count[s]) / total);
+        if (count[s] != 0 && norm == 0)
+            norm = 1;
+        assert(count[s] < total);
+        cost += count[s] * kInverseProbabilityLog256[norm];
+    }
+    return cost >> 8;
+}
+
+/**
+ * Returns the cost in bits of encoding the distribution in count using ctable.
+ * Returns an error if ctable cannot represent all the symbols in count.
+ */
+size_t ZSTD_fseBitCost(
+    FSE_CTable const* ctable,
+    unsigned const* count,
+    unsigned const max)
+{
+    unsigned const kAccuracyLog = 8;
+    size_t cost = 0;
+    unsigned s;
+    FSE_CState_t cstate;
+    FSE_initCState(&cstate, ctable);
+    if (ZSTD_getFSEMaxSymbolValue(ctable) < max) {
+        DEBUGLOG(5, "Repeat FSE_CTable has maxSymbolValue %u < %u",
+                    ZSTD_getFSEMaxSymbolValue(ctable), max);
+        return ERROR(GENERIC);
+    }
+    for (s = 0; s <= max; ++s) {
+        unsigned const tableLog = cstate.stateLog;
+        unsigned const badCost = (tableLog + 1) << kAccuracyLog;
+        unsigned const bitCost = FSE_bitCost(cstate.symbolTT, tableLog, s, kAccuracyLog);
+        if (count[s] == 0)
+            continue;
+        if (bitCost >= badCost) {
+            DEBUGLOG(5, "Repeat FSE_CTable has Prob[%u] == 0", s);
+            return ERROR(GENERIC);
+        }
+        cost += (size_t)count[s] * bitCost;
+    }
+    return cost >> kAccuracyLog;
+}
+
+/**
+ * Returns the cost in bits of encoding the distribution in count using the
+ * table described by norm. The max symbol support by norm is assumed >= max.
+ * norm must be valid for every symbol with non-zero probability in count.
+ */
+size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog,
+                             unsigned const* count, unsigned const max)
+{
+    unsigned const shift = 8 - accuracyLog;
+    size_t cost = 0;
+    unsigned s;
+    assert(accuracyLog <= 8);
+    for (s = 0; s <= max; ++s) {
+        unsigned const normAcc = (norm[s] != -1) ? (unsigned)norm[s] : 1;
+        unsigned const norm256 = normAcc << shift;
+        assert(norm256 > 0);
+        assert(norm256 < 256);
+        cost += count[s] * kInverseProbabilityLog256[norm256];
+    }
+    return cost >> 8;
+}
+
+symbolEncodingType_e
+ZSTD_selectEncodingType(
+        FSE_repeat* repeatMode, unsigned const* count, unsigned const max,
+        size_t const mostFrequent, size_t nbSeq, unsigned const FSELog,
+        FSE_CTable const* prevCTable,
+        short const* defaultNorm, U32 defaultNormLog,
+        ZSTD_defaultPolicy_e const isDefaultAllowed,
+        ZSTD_strategy const strategy)
+{
+    ZSTD_STATIC_ASSERT(ZSTD_defaultDisallowed == 0 && ZSTD_defaultAllowed != 0);
+    if (mostFrequent == nbSeq) {
+        *repeatMode = FSE_repeat_none;
+        if (isDefaultAllowed && nbSeq <= 2) {
+            /* Prefer set_basic over set_rle when there are 2 or fewer symbols,
+             * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol.
+             * If basic encoding isn't possible, always choose RLE.
+             */
+            DEBUGLOG(5, "Selected set_basic");
+            return set_basic;
+        }
+        DEBUGLOG(5, "Selected set_rle");
+        return set_rle;
+    }
+    if (strategy < ZSTD_lazy) {
+        if (isDefaultAllowed) {
+            size_t const staticFse_nbSeq_max = 1000;
+            size_t const mult = 10 - strategy;
+            size_t const baseLog = 3;
+            size_t const dynamicFse_nbSeq_min = (((size_t)1 << defaultNormLog) * mult) >> baseLog;  /* 28-36 for offset, 56-72 for lengths */
+            assert(defaultNormLog >= 5 && defaultNormLog <= 6);  /* xx_DEFAULTNORMLOG */
+            assert(mult <= 9 && mult >= 7);
+            if ( (*repeatMode == FSE_repeat_valid)
+              && (nbSeq < staticFse_nbSeq_max) ) {
+                DEBUGLOG(5, "Selected set_repeat");
+                return set_repeat;
+            }
+            if ( (nbSeq < dynamicFse_nbSeq_min)
+              || (mostFrequent < (nbSeq >> (defaultNormLog-1))) ) {
+                DEBUGLOG(5, "Selected set_basic");
+                /* The format allows default tables to be repeated, but it isn't useful.
+                 * When using simple heuristics to select encoding type, we don't want
+                 * to confuse these tables with dictionaries. When running more careful
+                 * analysis, we don't need to waste time checking both repeating tables
+                 * and default tables.
+                 */
+                *repeatMode = FSE_repeat_none;
+                return set_basic;
+            }
+        }
+    } else {
+        size_t const basicCost = isDefaultAllowed ? ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, count, max) : ERROR(GENERIC);
+        size_t const repeatCost = *repeatMode != FSE_repeat_none ? ZSTD_fseBitCost(prevCTable, count, max) : ERROR(GENERIC);
+        size_t const NCountCost = ZSTD_NCountCost(count, max, nbSeq, FSELog);
+        size_t const compressedCost = (NCountCost << 3) + ZSTD_entropyCost(count, max, nbSeq);
+
+        if (isDefaultAllowed) {
+            assert(!ZSTD_isError(basicCost));
+            assert(!(*repeatMode == FSE_repeat_valid && ZSTD_isError(repeatCost)));
+        }
+        assert(!ZSTD_isError(NCountCost));
+        assert(compressedCost < ERROR(maxCode));
+        DEBUGLOG(5, "Estimated bit costs: basic=%u\trepeat=%u\tcompressed=%u",
+                    (unsigned)basicCost, (unsigned)repeatCost, (unsigned)compressedCost);
+        if (basicCost <= repeatCost && basicCost <= compressedCost) {
+            DEBUGLOG(5, "Selected set_basic");
+            assert(isDefaultAllowed);
+            *repeatMode = FSE_repeat_none;
+            return set_basic;
+        }
+        if (repeatCost <= compressedCost) {
+            DEBUGLOG(5, "Selected set_repeat");
+            assert(!ZSTD_isError(repeatCost));
+            return set_repeat;
+        }
+        assert(compressedCost < basicCost && compressedCost < repeatCost);
+    }
+    DEBUGLOG(5, "Selected set_compressed");
+    *repeatMode = FSE_repeat_check;
+    return set_compressed;
+}
+
+typedef struct {
+    S16 norm[MaxSeq + 1];
+    U32 wksp[FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(MaxSeq, MaxFSELog)];
+} ZSTD_BuildCTableWksp;
+
+size_t
+ZSTD_buildCTable(void* dst, size_t dstCapacity,
+                FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type,
+                unsigned* count, U32 max,
+                const BYTE* codeTable, size_t nbSeq,
+                const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax,
+                const FSE_CTable* prevCTable, size_t prevCTableSize,
+                void* entropyWorkspace, size_t entropyWorkspaceSize)
+{
+    BYTE* op = (BYTE*)dst;
+    const BYTE* const oend = op + dstCapacity;
+    DEBUGLOG(6, "ZSTD_buildCTable (dstCapacity=%u)", (unsigned)dstCapacity);
+
+    switch (type) {
+    case set_rle:
+        FORWARD_IF_ERROR(FSE_buildCTable_rle(nextCTable, (BYTE)max), "");
+        RETURN_ERROR_IF(dstCapacity==0, dstSize_tooSmall, "not enough space");
+        *op = codeTable[0];
+        return 1;
+    case set_repeat:
+        ZSTD_memcpy(nextCTable, prevCTable, prevCTableSize);
+        return 0;
+    case set_basic:
+        FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, defaultNorm, defaultMax, defaultNormLog, entropyWorkspace, entropyWorkspaceSize), "");  /* note : could be pre-calculated */
+        return 0;
+    case set_compressed: {
+        ZSTD_BuildCTableWksp* wksp = (ZSTD_BuildCTableWksp*)entropyWorkspace;
+        size_t nbSeq_1 = nbSeq;
+        const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max);
+        if (count[codeTable[nbSeq-1]] > 1) {
+            count[codeTable[nbSeq-1]]--;
+            nbSeq_1--;
+        }
+        assert(nbSeq_1 > 1);
+        assert(entropyWorkspaceSize >= sizeof(ZSTD_BuildCTableWksp));
+        (void)entropyWorkspaceSize;
+        FORWARD_IF_ERROR(FSE_normalizeCount(wksp->norm, tableLog, count, nbSeq_1, max, ZSTD_useLowProbCount(nbSeq_1)), "FSE_normalizeCount failed");
+        assert(oend >= op);
+        {   size_t const NCountSize = FSE_writeNCount(op, (size_t)(oend - op), wksp->norm, max, tableLog);   /* overflow protected */
+            FORWARD_IF_ERROR(NCountSize, "FSE_writeNCount failed");
+            FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, wksp->norm, max, tableLog, wksp->wksp, sizeof(wksp->wksp)), "FSE_buildCTable_wksp failed");
+            return NCountSize;
+        }
+    }
+    default: assert(0); RETURN_ERROR(GENERIC, "impossible to reach");
+    }
+}
+
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_encodeSequences_body(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets)
+{
+    BIT_CStream_t blockStream;
+    FSE_CState_t  stateMatchLength;
+    FSE_CState_t  stateOffsetBits;
+    FSE_CState_t  stateLitLength;
+
+    RETURN_ERROR_IF(
+        ERR_isError(BIT_initCStream(&blockStream, dst, dstCapacity)),
+        dstSize_tooSmall, "not enough space remaining");
+    DEBUGLOG(6, "available space for bitstream : %i  (dstCapacity=%u)",
+                (int)(blockStream.endPtr - blockStream.startPtr),
+                (unsigned)dstCapacity);
+
+    /* first symbols */
+    FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq-1]);
+    FSE_initCState2(&stateOffsetBits,  CTable_OffsetBits,  ofCodeTable[nbSeq-1]);
+    FSE_initCState2(&stateLitLength,   CTable_LitLength,   llCodeTable[nbSeq-1]);
+    BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]);
+    if (MEM_32bits()) BIT_flushBits(&blockStream);
+    BIT_addBits(&blockStream, sequences[nbSeq-1].mlBase, ML_bits[mlCodeTable[nbSeq-1]]);
+    if (MEM_32bits()) BIT_flushBits(&blockStream);
+    if (longOffsets) {
+        U32 const ofBits = ofCodeTable[nbSeq-1];
+        unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1);
+        if (extraBits) {
+            BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, extraBits);
+            BIT_flushBits(&blockStream);
+        }
+        BIT_addBits(&blockStream, sequences[nbSeq-1].offBase >> extraBits,
+                    ofBits - extraBits);
+    } else {
+        BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, ofCodeTable[nbSeq-1]);
+    }
+    BIT_flushBits(&blockStream);
+
+    {   size_t n;
+        for (n=nbSeq-2 ; n<nbSeq ; n--) {      /* intentional underflow */
+            BYTE const llCode = llCodeTable[n];
+            BYTE const ofCode = ofCodeTable[n];
+            BYTE const mlCode = mlCodeTable[n];
+            U32  const llBits = LL_bits[llCode];
+            U32  const ofBits = ofCode;
+            U32  const mlBits = ML_bits[mlCode];
+            DEBUGLOG(6, "encoding: litlen:%2u - matchlen:%2u - offCode:%7u",
+                        (unsigned)sequences[n].litLength,
+                        (unsigned)sequences[n].mlBase + MINMATCH,
+                        (unsigned)sequences[n].offBase);
+                                                                            /* 32b*/  /* 64b*/
+                                                                            /* (7)*/  /* (7)*/
+            FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode);       /* 15 */  /* 15 */
+            FSE_encodeSymbol(&blockStream, &stateMatchLength, mlCode);      /* 24 */  /* 24 */
+            if (MEM_32bits()) BIT_flushBits(&blockStream);                  /* (7)*/
+            FSE_encodeSymbol(&blockStream, &stateLitLength, llCode);        /* 16 */  /* 33 */
+            if (MEM_32bits() || (ofBits+mlBits+llBits >= 64-7-(LLFSELog+MLFSELog+OffFSELog)))
+                BIT_flushBits(&blockStream);                                /* (7)*/
+            BIT_addBits(&blockStream, sequences[n].litLength, llBits);
+            if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream);
+            BIT_addBits(&blockStream, sequences[n].mlBase, mlBits);
+            if (MEM_32bits() || (ofBits+mlBits+llBits > 56)) BIT_flushBits(&blockStream);
+            if (longOffsets) {
+                unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1);
+                if (extraBits) {
+                    BIT_addBits(&blockStream, sequences[n].offBase, extraBits);
+                    BIT_flushBits(&blockStream);                            /* (7)*/
+                }
+                BIT_addBits(&blockStream, sequences[n].offBase >> extraBits,
+                            ofBits - extraBits);                            /* 31 */
+            } else {
+                BIT_addBits(&blockStream, sequences[n].offBase, ofBits);     /* 31 */
+            }
+            BIT_flushBits(&blockStream);                                    /* (7)*/
+            DEBUGLOG(7, "remaining space : %i", (int)(blockStream.endPtr - blockStream.ptr));
+    }   }
+
+    DEBUGLOG(6, "ZSTD_encodeSequences: flushing ML state with %u bits", stateMatchLength.stateLog);
+    FSE_flushCState(&blockStream, &stateMatchLength);
+    DEBUGLOG(6, "ZSTD_encodeSequences: flushing Off state with %u bits", stateOffsetBits.stateLog);
+    FSE_flushCState(&blockStream, &stateOffsetBits);
+    DEBUGLOG(6, "ZSTD_encodeSequences: flushing LL state with %u bits", stateLitLength.stateLog);
+    FSE_flushCState(&blockStream, &stateLitLength);
+
+    {   size_t const streamSize = BIT_closeCStream(&blockStream);
+        RETURN_ERROR_IF(streamSize==0, dstSize_tooSmall, "not enough space");
+        return streamSize;
+    }
+}
+
+static size_t
+ZSTD_encodeSequences_default(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets)
+{
+    return ZSTD_encodeSequences_body(dst, dstCapacity,
+                                    CTable_MatchLength, mlCodeTable,
+                                    CTable_OffsetBits, ofCodeTable,
+                                    CTable_LitLength, llCodeTable,
+                                    sequences, nbSeq, longOffsets);
+}
+
+
+#if DYNAMIC_BMI2
+
+static BMI2_TARGET_ATTRIBUTE size_t
+ZSTD_encodeSequences_bmi2(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets)
+{
+    return ZSTD_encodeSequences_body(dst, dstCapacity,
+                                    CTable_MatchLength, mlCodeTable,
+                                    CTable_OffsetBits, ofCodeTable,
+                                    CTable_LitLength, llCodeTable,
+                                    sequences, nbSeq, longOffsets);
+}
+
+#endif
+
+size_t ZSTD_encodeSequences(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2)
+{
+    DEBUGLOG(5, "ZSTD_encodeSequences: dstCapacity = %u", (unsigned)dstCapacity);
+#if DYNAMIC_BMI2
+    if (bmi2) {
+        return ZSTD_encodeSequences_bmi2(dst, dstCapacity,
+                                         CTable_MatchLength, mlCodeTable,
+                                         CTable_OffsetBits, ofCodeTable,
+                                         CTable_LitLength, llCodeTable,
+                                         sequences, nbSeq, longOffsets);
+    }
+#endif
+    (void)bmi2;
+    return ZSTD_encodeSequences_default(dst, dstCapacity,
+                                        CTable_MatchLength, mlCodeTable,
+                                        CTable_OffsetBits, ofCodeTable,
+                                        CTable_LitLength, llCodeTable,
+                                        sequences, nbSeq, longOffsets);
+}
--- a/third_party/zstd/lib/compress/zstd_compress_sequences.h
+++ b/third_party/zstd/lib/compress/zstd_compress_sequences.h
@ -0,0 +1,55 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMPRESS_SEQUENCES_H
+#define ZSTD_COMPRESS_SEQUENCES_H
+
+#include "third_party/zstd/lib/common/fse.h" /* FSE_repeat, FSE_CTable */
+#include "third_party/zstd/lib/common/zstd_internal.h" /* symbolEncodingType_e, ZSTD_strategy */
+
+typedef enum {
+    ZSTD_defaultDisallowed = 0,
+    ZSTD_defaultAllowed = 1
+} ZSTD_defaultPolicy_e;
+
+symbolEncodingType_e
+ZSTD_selectEncodingType(
+        FSE_repeat* repeatMode, unsigned const* count, unsigned const max,
+        size_t const mostFrequent, size_t nbSeq, unsigned const FSELog,
+        FSE_CTable const* prevCTable,
+        short const* defaultNorm, U32 defaultNormLog,
+        ZSTD_defaultPolicy_e const isDefaultAllowed,
+        ZSTD_strategy const strategy);
+
+size_t
+ZSTD_buildCTable(void* dst, size_t dstCapacity,
+                FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type,
+                unsigned* count, U32 max,
+                const BYTE* codeTable, size_t nbSeq,
+                const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax,
+                const FSE_CTable* prevCTable, size_t prevCTableSize,
+                void* entropyWorkspace, size_t entropyWorkspaceSize);
+
+size_t ZSTD_encodeSequences(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2);
+
+size_t ZSTD_fseBitCost(
+    FSE_CTable const* ctable,
+    unsigned const* count,
+    unsigned const max);
+
+size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog,
+                             unsigned const* count, unsigned const max);
+#endif /* ZSTD_COMPRESS_SEQUENCES_H */
--- a/third_party/zstd/lib/compress/zstd_compress_superblock.c
+++ b/third_party/zstd/lib/compress/zstd_compress_superblock.c
@ -0,0 +1,578 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ /*-*************************************
+ *  Dependencies
+ ***************************************/
+#include "third_party/zstd/lib/compress/zstd_compress_superblock.h"
+
+#include "third_party/zstd/lib/common/zstd_internal.h"  /* ZSTD_getSequenceLength */
+#include "third_party/zstd/lib/compress/hist.h"                     /* HIST_countFast_wksp */
+#include "third_party/zstd/lib/compress/zstd_compress_internal.h"   /* ZSTD_[huf|fse|entropy]CTablesMetadata_t */
+#include "third_party/zstd/lib/compress/zstd_compress_sequences.h"
+#include "third_party/zstd/lib/compress/zstd_compress_literals.h"
+
+/** ZSTD_compressSubBlock_literal() :
+ *  Compresses literals section for a sub-block.
+ *  When we have to write the Huffman table we will sometimes choose a header
+ *  size larger than necessary. This is because we have to pick the header size
+ *  before we know the table size + compressed size, so we have a bound on the
+ *  table size. If we guessed incorrectly, we fall back to uncompressed literals.
+ *
+ *  We write the header when writeEntropy=1 and set entropyWritten=1 when we succeeded
+ *  in writing the header, otherwise it is set to 0.
+ *
+ *  hufMetadata->hType has literals block type info.
+ *      If it is set_basic, all sub-blocks literals section will be Raw_Literals_Block.
+ *      If it is set_rle, all sub-blocks literals section will be RLE_Literals_Block.
+ *      If it is set_compressed, first sub-block's literals section will be Compressed_Literals_Block
+ *      If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block
+ *      and the following sub-blocks' literals sections will be Treeless_Literals_Block.
+ *  @return : compressed size of literals section of a sub-block
+ *            Or 0 if unable to compress.
+ *            Or error code */
+static size_t
+ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
+                              const ZSTD_hufCTablesMetadata_t* hufMetadata,
+                              const BYTE* literals, size_t litSize,
+                              void* dst, size_t dstSize,
+                              const int bmi2, int writeEntropy, int* entropyWritten)
+{
+    size_t const header = writeEntropy ? 200 : 0;
+    size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header));
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstSize;
+    BYTE* op = ostart + lhSize;
+    U32 const singleStream = lhSize == 3;
+    symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat;
+    size_t cLitSize = 0;
+
+    DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy);
+
+    *entropyWritten = 0;
+    if (litSize == 0 || hufMetadata->hType == set_basic) {
+      DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal");
+      return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize);
+    } else if (hufMetadata->hType == set_rle) {
+      DEBUGLOG(5, "ZSTD_compressSubBlock_literal using rle literal");
+      return ZSTD_compressRleLiteralsBlock(dst, dstSize, literals, litSize);
+    }
+
+    assert(litSize > 0);
+    assert(hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat);
+
+    if (writeEntropy && hufMetadata->hType == set_compressed) {
+        ZSTD_memcpy(op, hufMetadata->hufDesBuffer, hufMetadata->hufDesSize);
+        op += hufMetadata->hufDesSize;
+        cLitSize += hufMetadata->hufDesSize;
+        DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize);
+    }
+
+    {   int const flags = bmi2 ? HUF_flags_bmi2 : 0;
+        const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable, flags)
+                                          : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable, flags);
+        op += cSize;
+        cLitSize += cSize;
+        if (cSize == 0 || ERR_isError(cSize)) {
+            DEBUGLOG(5, "Failed to write entropy tables %s", ZSTD_getErrorName(cSize));
+            return 0;
+        }
+        /* If we expand and we aren't writing a header then emit uncompressed */
+        if (!writeEntropy && cLitSize >= litSize) {
+            DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal because uncompressible");
+            return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize);
+        }
+        /* If we are writing headers then allow expansion that doesn't change our header size. */
+        if (lhSize < (size_t)(3 + (cLitSize >= 1 KB) + (cLitSize >= 16 KB))) {
+            assert(cLitSize > litSize);
+            DEBUGLOG(5, "Literals expanded beyond allowed header size");
+            return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize);
+        }
+        DEBUGLOG(5, "ZSTD_compressSubBlock_literal (cSize=%zu)", cSize);
+    }
+
+    /* Build header */
+    switch(lhSize)
+    {
+    case 3: /* 2 - 2 - 10 - 10 */
+        {   U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14);
+            MEM_writeLE24(ostart, lhc);
+            break;
+        }
+    case 4: /* 2 - 2 - 14 - 14 */
+        {   U32 const lhc = hType + (2 << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<18);
+            MEM_writeLE32(ostart, lhc);
+            break;
+        }
+    case 5: /* 2 - 2 - 18 - 18 */
+        {   U32 const lhc = hType + (3 << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<22);
+            MEM_writeLE32(ostart, lhc);
+            ostart[4] = (BYTE)(cLitSize >> 10);
+            break;
+        }
+    default:  /* not possible : lhSize is {3,4,5} */
+        assert(0);
+    }
+    *entropyWritten = 1;
+    DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart));
+    return op-ostart;
+}
+
+static size_t
+ZSTD_seqDecompressedSize(seqStore_t const* seqStore,
+                   const seqDef* sequences, size_t nbSeq,
+                         size_t litSize, int lastSequence)
+{
+    const seqDef* const sstart = sequences;
+    const seqDef* const send = sequences + nbSeq;
+    const seqDef* sp = sstart;
+    size_t matchLengthSum = 0;
+    size_t litLengthSum = 0;
+    (void)(litLengthSum); /* suppress unused variable warning on some environments */
+    while (send-sp > 0) {
+        ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp);
+        litLengthSum += seqLen.litLength;
+        matchLengthSum += seqLen.matchLength;
+        sp++;
+    }
+    assert(litLengthSum <= litSize);
+    if (!lastSequence) {
+        assert(litLengthSum == litSize);
+    }
+    return matchLengthSum + litSize;
+}
+
+/** ZSTD_compressSubBlock_sequences() :
+ *  Compresses sequences section for a sub-block.
+ *  fseMetadata->llType, fseMetadata->ofType, and fseMetadata->mlType have
+ *  symbol compression modes for the super-block.
+ *  The first successfully compressed block will have these in its header.
+ *  We set entropyWritten=1 when we succeed in compressing the sequences.
+ *  The following sub-blocks will always have repeat mode.
+ *  @return : compressed size of sequences section of a sub-block
+ *            Or 0 if it is unable to compress
+ *            Or error code. */
+static size_t
+ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
+                                const ZSTD_fseCTablesMetadata_t* fseMetadata,
+                                const seqDef* sequences, size_t nbSeq,
+                                const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
+                                const ZSTD_CCtx_params* cctxParams,
+                                void* dst, size_t dstCapacity,
+                                const int bmi2, int writeEntropy, int* entropyWritten)
+{
+    const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstCapacity;
+    BYTE* op = ostart;
+    BYTE* seqHead;
+
+    DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (nbSeq=%zu, writeEntropy=%d, longOffsets=%d)", nbSeq, writeEntropy, longOffsets);
+
+    *entropyWritten = 0;
+    /* Sequences Header */
+    RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/,
+                    dstSize_tooSmall, "");
+    if (nbSeq < 128)
+        *op++ = (BYTE)nbSeq;
+    else if (nbSeq < LONGNBSEQ)
+        op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2;
+    else
+        op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3;
+    if (nbSeq==0) {
+        return op - ostart;
+    }
+
+    /* seqHead : flags for FSE encoding type */
+    seqHead = op++;
+
+    DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (seqHeadSize=%u)", (unsigned)(op-ostart));
+
+    if (writeEntropy) {
+        const U32 LLtype = fseMetadata->llType;
+        const U32 Offtype = fseMetadata->ofType;
+        const U32 MLtype = fseMetadata->mlType;
+        DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (fseTablesSize=%zu)", fseMetadata->fseTablesSize);
+        *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2));
+        ZSTD_memcpy(op, fseMetadata->fseTablesBuffer, fseMetadata->fseTablesSize);
+        op += fseMetadata->fseTablesSize;
+    } else {
+        const U32 repeat = set_repeat;
+        *seqHead = (BYTE)((repeat<<6) + (repeat<<4) + (repeat<<2));
+    }
+
+    {   size_t const bitstreamSize = ZSTD_encodeSequences(
+                                        op, oend - op,
+                                        fseTables->matchlengthCTable, mlCode,
+                                        fseTables->offcodeCTable, ofCode,
+                                        fseTables->litlengthCTable, llCode,
+                                        sequences, nbSeq,
+                                        longOffsets, bmi2);
+        FORWARD_IF_ERROR(bitstreamSize, "ZSTD_encodeSequences failed");
+        op += bitstreamSize;
+        /* zstd versions <= 1.3.4 mistakenly report corruption when
+         * FSE_readNCount() receives a buffer < 4 bytes.
+         * Fixed by https://github.com/facebook/zstd/pull/1146.
+         * This can happen when the last set_compressed table present is 2
+         * bytes and the bitstream is only one byte.
+         * In this exceedingly rare case, we will simply emit an uncompressed
+         * block, since it isn't worth optimizing.
+         */
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+        if (writeEntropy && fseMetadata->lastCountSize && fseMetadata->lastCountSize + bitstreamSize < 4) {
+            /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */
+            assert(fseMetadata->lastCountSize + bitstreamSize == 3);
+            DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by "
+                        "emitting an uncompressed block.");
+            return 0;
+        }
+#endif
+        DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (bitstreamSize=%zu)", bitstreamSize);
+    }
+
+    /* zstd versions <= 1.4.0 mistakenly report error when
+     * sequences section body size is less than 3 bytes.
+     * Fixed by https://github.com/facebook/zstd/pull/1664.
+     * This can happen when the previous sequences section block is compressed
+     * with rle mode and the current block's sequences section is compressed
+     * with repeat mode where sequences section body size can be 1 byte.
+     */
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    if (op-seqHead < 4) {
+        DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.4.0 by emitting "
+                    "an uncompressed block when sequences are < 4 bytes");
+        return 0;
+    }
+#endif
+
+    *entropyWritten = 1;
+    return op - ostart;
+}
+
+/** ZSTD_compressSubBlock() :
+ *  Compresses a single sub-block.
+ *  @return : compressed size of the sub-block
+ *            Or 0 if it failed to compress. */
+static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy,
+                                    const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                                    const seqDef* sequences, size_t nbSeq,
+                                    const BYTE* literals, size_t litSize,
+                                    const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
+                                    const ZSTD_CCtx_params* cctxParams,
+                                    void* dst, size_t dstCapacity,
+                                    const int bmi2,
+                                    int writeLitEntropy, int writeSeqEntropy,
+                                    int* litEntropyWritten, int* seqEntropyWritten,
+                                    U32 lastBlock)
+{
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstCapacity;
+    BYTE* op = ostart + ZSTD_blockHeaderSize;
+    DEBUGLOG(5, "ZSTD_compressSubBlock (litSize=%zu, nbSeq=%zu, writeLitEntropy=%d, writeSeqEntropy=%d, lastBlock=%d)",
+                litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock);
+    {   size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable,
+                                                        &entropyMetadata->hufMetadata, literals, litSize,
+                                                        op, oend-op, bmi2, writeLitEntropy, litEntropyWritten);
+        FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed");
+        if (cLitSize == 0) return 0;
+        op += cLitSize;
+    }
+    {   size_t cSeqSize = ZSTD_compressSubBlock_sequences(&entropy->fse,
+                                                  &entropyMetadata->fseMetadata,
+                                                  sequences, nbSeq,
+                                                  llCode, mlCode, ofCode,
+                                                  cctxParams,
+                                                  op, oend-op,
+                                                  bmi2, writeSeqEntropy, seqEntropyWritten);
+        FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed");
+        if (cSeqSize == 0) return 0;
+        op += cSeqSize;
+    }
+    /* Write block header */
+    {   size_t cSize = (op-ostart)-ZSTD_blockHeaderSize;
+        U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
+        MEM_writeLE24(ostart, cBlockHeader24);
+    }
+    return op-ostart;
+}
+
+static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize,
+                                                const ZSTD_hufCTables_t* huf,
+                                                const ZSTD_hufCTablesMetadata_t* hufMetadata,
+                                                void* workspace, size_t wkspSize,
+                                                int writeEntropy)
+{
+    unsigned* const countWksp = (unsigned*)workspace;
+    unsigned maxSymbolValue = 255;
+    size_t literalSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */
+
+    if (hufMetadata->hType == set_basic) return litSize;
+    else if (hufMetadata->hType == set_rle) return 1;
+    else if (hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat) {
+        size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)literals, litSize, workspace, wkspSize);
+        if (ZSTD_isError(largest)) return litSize;
+        {   size_t cLitSizeEstimate = HUF_estimateCompressedSize((const HUF_CElt*)huf->CTable, countWksp, maxSymbolValue);
+            if (writeEntropy) cLitSizeEstimate += hufMetadata->hufDesSize;
+            return cLitSizeEstimate + literalSectionHeaderSize;
+    }   }
+    assert(0); /* impossible */
+    return 0;
+}
+
+static size_t ZSTD_estimateSubBlockSize_symbolType(symbolEncodingType_e type,
+                        const BYTE* codeTable, unsigned maxCode,
+                        size_t nbSeq, const FSE_CTable* fseCTable,
+                        const U8* additionalBits,
+                        short const* defaultNorm, U32 defaultNormLog, U32 defaultMax,
+                        void* workspace, size_t wkspSize)
+{
+    unsigned* const countWksp = (unsigned*)workspace;
+    const BYTE* ctp = codeTable;
+    const BYTE* const ctStart = ctp;
+    const BYTE* const ctEnd = ctStart + nbSeq;
+    size_t cSymbolTypeSizeEstimateInBits = 0;
+    unsigned max = maxCode;
+
+    HIST_countFast_wksp(countWksp, &max, codeTable, nbSeq, workspace, wkspSize);  /* can't fail */
+    if (type == set_basic) {
+        /* We selected this encoding type, so it must be valid. */
+        assert(max <= defaultMax);
+        cSymbolTypeSizeEstimateInBits = max <= defaultMax
+                ? ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, countWksp, max)
+                : ERROR(GENERIC);
+    } else if (type == set_rle) {
+        cSymbolTypeSizeEstimateInBits = 0;
+    } else if (type == set_compressed || type == set_repeat) {
+        cSymbolTypeSizeEstimateInBits = ZSTD_fseBitCost(fseCTable, countWksp, max);
+    }
+    if (ZSTD_isError(cSymbolTypeSizeEstimateInBits)) return nbSeq * 10;
+    while (ctp < ctEnd) {
+        if (additionalBits) cSymbolTypeSizeEstimateInBits += additionalBits[*ctp];
+        else cSymbolTypeSizeEstimateInBits += *ctp; /* for offset, offset code is also the number of additional bits */
+        ctp++;
+    }
+    return cSymbolTypeSizeEstimateInBits / 8;
+}
+
+static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable,
+                                                  const BYTE* llCodeTable,
+                                                  const BYTE* mlCodeTable,
+                                                  size_t nbSeq,
+                                                  const ZSTD_fseCTables_t* fseTables,
+                                                  const ZSTD_fseCTablesMetadata_t* fseMetadata,
+                                                  void* workspace, size_t wkspSize,
+                                                  int writeEntropy)
+{
+    size_t const sequencesSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */
+    size_t cSeqSizeEstimate = 0;
+    if (nbSeq == 0) return sequencesSectionHeaderSize;
+    cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, MaxOff,
+                                         nbSeq, fseTables->offcodeCTable, NULL,
+                                         OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
+                                         workspace, wkspSize);
+    cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->llType, llCodeTable, MaxLL,
+                                         nbSeq, fseTables->litlengthCTable, LL_bits,
+                                         LL_defaultNorm, LL_defaultNormLog, MaxLL,
+                                         workspace, wkspSize);
+    cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, MaxML,
+                                         nbSeq, fseTables->matchlengthCTable, ML_bits,
+                                         ML_defaultNorm, ML_defaultNormLog, MaxML,
+                                         workspace, wkspSize);
+    if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize;
+    return cSeqSizeEstimate + sequencesSectionHeaderSize;
+}
+
+static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
+                                        const BYTE* ofCodeTable,
+                                        const BYTE* llCodeTable,
+                                        const BYTE* mlCodeTable,
+                                        size_t nbSeq,
+                                        const ZSTD_entropyCTables_t* entropy,
+                                        const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                                        void* workspace, size_t wkspSize,
+                                        int writeLitEntropy, int writeSeqEntropy) {
+    size_t cSizeEstimate = 0;
+    cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize,
+                                                         &entropy->huf, &entropyMetadata->hufMetadata,
+                                                         workspace, wkspSize, writeLitEntropy);
+    cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
+                                                         nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
+                                                         workspace, wkspSize, writeSeqEntropy);
+    return cSizeEstimate + ZSTD_blockHeaderSize;
+}
+
+static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata)
+{
+    if (fseMetadata->llType == set_compressed || fseMetadata->llType == set_rle)
+        return 1;
+    if (fseMetadata->mlType == set_compressed || fseMetadata->mlType == set_rle)
+        return 1;
+    if (fseMetadata->ofType == set_compressed || fseMetadata->ofType == set_rle)
+        return 1;
+    return 0;
+}
+
+/** ZSTD_compressSubBlock_multi() :
+ *  Breaks super-block into multiple sub-blocks and compresses them.
+ *  Entropy will be written to the first block.
+ *  The following blocks will use repeat mode to compress.
+ *  All sub-blocks are compressed blocks (no raw or rle blocks).
+ *  @return : compressed size of the super block (which is multiple ZSTD blocks)
+ *            Or 0 if it failed to compress. */
+static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
+                            const ZSTD_compressedBlockState_t* prevCBlock,
+                            ZSTD_compressedBlockState_t* nextCBlock,
+                            const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                            const ZSTD_CCtx_params* cctxParams,
+                                  void* dst, size_t dstCapacity,
+                            const void* src, size_t srcSize,
+                            const int bmi2, U32 lastBlock,
+                            void* workspace, size_t wkspSize)
+{
+    const seqDef* const sstart = seqStorePtr->sequencesStart;
+    const seqDef* const send = seqStorePtr->sequences;
+    const seqDef* sp = sstart;
+    const BYTE* const lstart = seqStorePtr->litStart;
+    const BYTE* const lend = seqStorePtr->lit;
+    const BYTE* lp = lstart;
+    BYTE const* ip = (BYTE const*)src;
+    BYTE const* const iend = ip + srcSize;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstCapacity;
+    BYTE* op = ostart;
+    const BYTE* llCodePtr = seqStorePtr->llCode;
+    const BYTE* mlCodePtr = seqStorePtr->mlCode;
+    const BYTE* ofCodePtr = seqStorePtr->ofCode;
+    size_t targetCBlockSize = cctxParams->targetCBlockSize;
+    size_t litSize, seqCount;
+    int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed;
+    int writeSeqEntropy = 1;
+    int lastSequence = 0;
+
+    DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)",
+                (unsigned)(lend-lp), (unsigned)(send-sstart));
+
+    litSize = 0;
+    seqCount = 0;
+    do {
+        size_t cBlockSizeEstimate = 0;
+        if (sstart == send) {
+            lastSequence = 1;
+        } else {
+            const seqDef* const sequence = sp + seqCount;
+            lastSequence = sequence == send - 1;
+            litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength;
+            seqCount++;
+        }
+        if (lastSequence) {
+            assert(lp <= lend);
+            assert(litSize <= (size_t)(lend - lp));
+            litSize = (size_t)(lend - lp);
+        }
+        /* I think there is an optimization opportunity here.
+         * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful
+         * since it recalculates estimate from scratch.
+         * For example, it would recount literal distribution and symbol codes every time.
+         */
+        cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount,
+                                                       &nextCBlock->entropy, entropyMetadata,
+                                                       workspace, wkspSize, writeLitEntropy, writeSeqEntropy);
+        if (cBlockSizeEstimate > targetCBlockSize || lastSequence) {
+            int litEntropyWritten = 0;
+            int seqEntropyWritten = 0;
+            const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence);
+            const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata,
+                                                       sp, seqCount,
+                                                       lp, litSize,
+                                                       llCodePtr, mlCodePtr, ofCodePtr,
+                                                       cctxParams,
+                                                       op, oend-op,
+                                                       bmi2, writeLitEntropy, writeSeqEntropy,
+                                                       &litEntropyWritten, &seqEntropyWritten,
+                                                       lastBlock && lastSequence);
+            FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed");
+            if (cSize > 0 && cSize < decompressedSize) {
+                DEBUGLOG(5, "Committed the sub-block");
+                assert(ip + decompressedSize <= iend);
+                ip += decompressedSize;
+                sp += seqCount;
+                lp += litSize;
+                op += cSize;
+                llCodePtr += seqCount;
+                mlCodePtr += seqCount;
+                ofCodePtr += seqCount;
+                litSize = 0;
+                seqCount = 0;
+                /* Entropy only needs to be written once */
+                if (litEntropyWritten) {
+                    writeLitEntropy = 0;
+                }
+                if (seqEntropyWritten) {
+                    writeSeqEntropy = 0;
+                }
+            }
+        }
+    } while (!lastSequence);
+    if (writeLitEntropy) {
+        DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten");
+        ZSTD_memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf));
+    }
+    if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) {
+        /* If we haven't written our entropy tables, then we've violated our contract and
+         * must emit an uncompressed block.
+         */
+        DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten");
+        return 0;
+    }
+    if (ip < iend) {
+        size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock);
+        DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip));
+        FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
+        assert(cSize != 0);
+        op += cSize;
+        /* We have to regenerate the repcodes because we've skipped some sequences */
+        if (sp < send) {
+            seqDef const* seq;
+            repcodes_t rep;
+            ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep));
+            for (seq = sstart; seq < sp; ++seq) {
+                ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0);
+            }
+            ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep));
+        }
+    }
+    DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed");
+    return op-ostart;
+}
+
+size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc,
+                               void* dst, size_t dstCapacity,
+                               void const* src, size_t srcSize,
+                               unsigned lastBlock) {
+    ZSTD_entropyCTablesMetadata_t entropyMetadata;
+
+    FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore,
+          &zc->blockState.prevCBlock->entropy,
+          &zc->blockState.nextCBlock->entropy,
+          &zc->appliedParams,
+          &entropyMetadata,
+          zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), "");
+
+    return ZSTD_compressSubBlock_multi(&zc->seqStore,
+            zc->blockState.prevCBlock,
+            zc->blockState.nextCBlock,
+            &entropyMetadata,
+            &zc->appliedParams,
+            dst, dstCapacity,
+            src, srcSize,
+            zc->bmi2, lastBlock,
+            zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */);
+}
--- a/third_party/zstd/lib/compress/zstd_compress_superblock.h
+++ b/third_party/zstd/lib/compress/zstd_compress_superblock.h
@ -0,0 +1,33 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMPRESS_ADVANCED_H
+#define ZSTD_COMPRESS_ADVANCED_H
+
+/*-*************************************
+*  Dependencies
+***************************************/
+
+#include "third_party/zstd/zstd.h" /* ZSTD_CCtx */
+
+/*-*************************************
+*  Target Compressed Block Size
+***************************************/
+
+/* ZSTD_compressSuperBlock() :
+ * Used to compress a super block when targetCBlockSize is being used.
+ * The given block will be compressed into multiple sub blocks that are around targetCBlockSize. */
+size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc,
+                               void* dst, size_t dstCapacity,
+                               void const* src, size_t srcSize,
+                               unsigned lastBlock);
+
+#endif /* ZSTD_COMPRESS_ADVANCED_H */
--- a/third_party/zstd/lib/compress/zstd_cwksp.h
+++ b/third_party/zstd/lib/compress/zstd_cwksp.h
@ -0,0 +1,744 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_CWKSP_H
+#define ZSTD_CWKSP_H
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include "third_party/zstd/lib/common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customFree */
+#include "third_party/zstd/lib/common/zstd_internal.h"
+#include "third_party/zstd/lib/common/portability_macros.h"
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-*************************************
+*  Constants
+***************************************/
+
+/* Since the workspace is effectively its own little malloc implementation /
+ * arena, when we run under ASAN, we should similarly insert redzones between
+ * each internal element of the workspace, so ASAN will catch overruns that
+ * reach outside an object but that stay inside the workspace.
+ *
+ * This defines the size of that redzone.
+ */
+#ifndef ZSTD_CWKSP_ASAN_REDZONE_SIZE
+#define ZSTD_CWKSP_ASAN_REDZONE_SIZE 128
+#endif
+
+
+/* Set our tables and aligneds to align by 64 bytes */
+#define ZSTD_CWKSP_ALIGNMENT_BYTES 64
+
+/*-*************************************
+*  Structures
+***************************************/
+typedef enum {
+    ZSTD_cwksp_alloc_objects,
+    ZSTD_cwksp_alloc_aligned_init_once,
+    ZSTD_cwksp_alloc_aligned,
+    ZSTD_cwksp_alloc_buffers
+} ZSTD_cwksp_alloc_phase_e;
+
+/**
+ * Used to describe whether the workspace is statically allocated (and will not
+ * necessarily ever be freed), or if it's dynamically allocated and we can
+ * expect a well-formed caller to free this.
+ */
+typedef enum {
+    ZSTD_cwksp_dynamic_alloc,
+    ZSTD_cwksp_static_alloc
+} ZSTD_cwksp_static_alloc_e;
+
+/**
+ * Zstd fits all its internal datastructures into a single continuous buffer,
+ * so that it only needs to perform a single OS allocation (or so that a buffer
+ * can be provided to it and it can perform no allocations at all). This buffer
+ * is called the workspace.
+ *
+ * Several optimizations complicate that process of allocating memory ranges
+ * from this workspace for each internal datastructure:
+ *
+ * - These different internal datastructures have different setup requirements:
+ *
+ *   - The static objects need to be cleared once and can then be trivially
+ *     reused for each compression.
+ *
+ *   - Various buffers don't need to be initialized at all--they are always
+ *     written into before they're read.
+ *
+ *   - The matchstate tables have a unique requirement that they don't need
+ *     their memory to be totally cleared, but they do need the memory to have
+ *     some bound, i.e., a guarantee that all values in the memory they've been
+ *     allocated is less than some maximum value (which is the starting value
+ *     for the indices that they will then use for compression). When this
+ *     guarantee is provided to them, they can use the memory without any setup
+ *     work. When it can't, they have to clear the area.
+ *
+ * - These buffers also have different alignment requirements.
+ *
+ * - We would like to reuse the objects in the workspace for multiple
+ *   compressions without having to perform any expensive reallocation or
+ *   reinitialization work.
+ *
+ * - We would like to be able to efficiently reuse the workspace across
+ *   multiple compressions **even when the compression parameters change** and
+ *   we need to resize some of the objects (where possible).
+ *
+ * To attempt to manage this buffer, given these constraints, the ZSTD_cwksp
+ * abstraction was created. It works as follows:
+ *
+ * Workspace Layout:
+ *
+ * [                        ... workspace ...                           ]
+ * [objects][tables ->] free space [<- buffers][<- aligned][<- init once]
+ *
+ * The various objects that live in the workspace are divided into the
+ * following categories, and are allocated separately:
+ *
+ * - Static objects: this is optionally the enclosing ZSTD_CCtx or ZSTD_CDict,
+ *   so that literally everything fits in a single buffer. Note: if present,
+ *   this must be the first object in the workspace, since ZSTD_customFree{CCtx,
+ *   CDict}() rely on a pointer comparison to see whether one or two frees are
+ *   required.
+ *
+ * - Fixed size objects: these are fixed-size, fixed-count objects that are
+ *   nonetheless "dynamically" allocated in the workspace so that we can
+ *   control how they're initialized separately from the broader ZSTD_CCtx.
+ *   Examples:
+ *   - Entropy Workspace
+ *   - 2 x ZSTD_compressedBlockState_t
+ *   - CDict dictionary contents
+ *
+ * - Tables: these are any of several different datastructures (hash tables,
+ *   chain tables, binary trees) that all respect a common format: they are
+ *   uint32_t arrays, all of whose values are between 0 and (nextSrc - base).
+ *   Their sizes depend on the cparams. These tables are 64-byte aligned.
+ *
+ * - Init once: these buffers require to be initialized at least once before
+ *   use. They should be used when we want to skip memory initialization
+ *   while not triggering memory checkers (like Valgrind) when reading from
+ *   from this memory without writing to it first.
+ *   These buffers should be used carefully as they might contain data
+ *   from previous compressions.
+ *   Buffers are aligned to 64 bytes.
+ *
+ * - Aligned: these buffers don't require any initialization before they're
+ *   used. The user of the buffer should make sure they write into a buffer
+ *   location before reading from it.
+ *   Buffers are aligned to 64 bytes.
+ *
+ * - Buffers: these buffers are used for various purposes that don't require
+ *   any alignment or initialization before they're used. This means they can
+ *   be moved around at no cost for a new compression.
+ *
+ * Allocating Memory:
+ *
+ * The various types of objects must be allocated in order, so they can be
+ * correctly packed into the workspace buffer. That order is:
+ *
+ * 1. Objects
+ * 2. Init once / Tables
+ * 3. Aligned / Tables
+ * 4. Buffers / Tables
+ *
+ * Attempts to reserve objects of different types out of order will fail.
+ */
+typedef struct {
+    void* workspace;
+    void* workspaceEnd;
+
+    void* objectEnd;
+    void* tableEnd;
+    void* tableValidEnd;
+    void* allocStart;
+    void* initOnceStart;
+
+    BYTE allocFailed;
+    int workspaceOversizedDuration;
+    ZSTD_cwksp_alloc_phase_e phase;
+    ZSTD_cwksp_static_alloc_e isStatic;
+} ZSTD_cwksp;
+
+/*-*************************************
+*  Functions
+***************************************/
+
+MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws);
+MEM_STATIC void*  ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws);
+
+MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) {
+    (void)ws;
+    assert(ws->workspace <= ws->objectEnd);
+    assert(ws->objectEnd <= ws->tableEnd);
+    assert(ws->objectEnd <= ws->tableValidEnd);
+    assert(ws->tableEnd <= ws->allocStart);
+    assert(ws->tableValidEnd <= ws->allocStart);
+    assert(ws->allocStart <= ws->workspaceEnd);
+    assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws));
+    assert(ws->workspace <= ws->initOnceStart);
+#if ZSTD_MEMORY_SANITIZER
+    {
+        intptr_t const offset = __msan_test_shadow(ws->initOnceStart,
+            (U8*)ZSTD_cwksp_initialAllocStart(ws) - (U8*)ws->initOnceStart);
+        (void)offset;
+#if defined(ZSTD_MSAN_PRINT)
+        if(offset!=-1) {
+            __msan_print_shadow((U8*)ws->initOnceStart + offset - 8, 32);
+        }
+#endif
+        assert(offset==-1);
+    };
+#endif
+}
+
+/**
+ * Align must be a power of 2.
+ */
+MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) {
+    size_t const mask = align - 1;
+    assert((align & mask) == 0);
+    return (size + mask) & ~mask;
+}
+
+/**
+ * Use this to determine how much space in the workspace we will consume to
+ * allocate this object. (Normally it should be exactly the size of the object,
+ * but under special conditions, like ASAN, where we pad each object, it might
+ * be larger.)
+ *
+ * Since tables aren't currently redzoned, you don't need to call through this
+ * to figure out how much space you need for the matchState tables. Everything
+ * else is though.
+ *
+ * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned_alloc_size().
+ */
+MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) {
+    if (size == 0)
+        return 0;
+#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+    return size + 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+#else
+    return size;
+#endif
+}
+
+/**
+ * Returns an adjusted alloc size that is the nearest larger multiple of 64 bytes.
+ * Used to determine the number of bytes required for a given "aligned".
+ */
+MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) {
+    return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, ZSTD_CWKSP_ALIGNMENT_BYTES));
+}
+
+/**
+ * Returns the amount of additional space the cwksp must allocate
+ * for internal purposes (currently only alignment).
+ */
+MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) {
+    /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES
+     * bytes to align the beginning of tables section and end of buffers;
+     */
+    size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2;
+    return slackSpace;
+}
+
+
+/**
+ * Return the number of additional bytes required to align a pointer to the given number of bytes.
+ * alignBytes must be a power of two.
+ */
+MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignBytes) {
+    size_t const alignBytesMask = alignBytes - 1;
+    size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask;
+    assert((alignBytes & alignBytesMask) == 0);
+    assert(bytes < alignBytes);
+    return bytes;
+}
+
+/**
+ * Returns the initial value for allocStart which is used to determine the position from
+ * which we can allocate from the end of the workspace.
+ */
+MEM_STATIC void*  ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) {
+    return (void*)((size_t)ws->workspaceEnd & ~(ZSTD_CWKSP_ALIGNMENT_BYTES-1));
+}
+
+/**
+ * Internal function. Do not use directly.
+ * Reserves the given number of bytes within the aligned/buffer segment of the wksp,
+ * which counts from the end of the wksp (as opposed to the object/table segment).
+ *
+ * Returns a pointer to the beginning of that space.
+ */
+MEM_STATIC void*
+ZSTD_cwksp_reserve_internal_buffer_space(ZSTD_cwksp* ws, size_t const bytes)
+{
+    void* const alloc = (BYTE*)ws->allocStart - bytes;
+    void* const bottom = ws->tableEnd;
+    DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining",
+        alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes);
+    ZSTD_cwksp_assert_internal_consistency(ws);
+    assert(alloc >= bottom);
+    if (alloc < bottom) {
+        DEBUGLOG(4, "cwksp: alloc failed!");
+        ws->allocFailed = 1;
+        return NULL;
+    }
+    /* the area is reserved from the end of wksp.
+     * If it overlaps with tableValidEnd, it voids guarantees on values' range */
+    if (alloc < ws->tableValidEnd) {
+        ws->tableValidEnd = alloc;
+    }
+    ws->allocStart = alloc;
+    return alloc;
+}
+
+/**
+ * Moves the cwksp to the next phase, and does any necessary allocations.
+ * cwksp initialization must necessarily go through each phase in order.
+ * Returns a 0 on success, or zstd error
+ */
+MEM_STATIC size_t
+ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase)
+{
+    assert(phase >= ws->phase);
+    if (phase > ws->phase) {
+        /* Going from allocating objects to allocating initOnce / tables */
+        if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once &&
+            phase >= ZSTD_cwksp_alloc_aligned_init_once) {
+            ws->tableValidEnd = ws->objectEnd;
+            ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws);
+
+            {   /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */
+                void *const alloc = ws->objectEnd;
+                size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES);
+                void *const objectEnd = (BYTE *) alloc + bytesToAlign;
+                DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign);
+                RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation,
+                                "table phase - alignment initial allocation failed!");
+                ws->objectEnd = objectEnd;
+                ws->tableEnd = objectEnd;  /* table area starts being empty */
+                if (ws->tableValidEnd < ws->tableEnd) {
+                    ws->tableValidEnd = ws->tableEnd;
+                }
+            }
+        }
+        ws->phase = phase;
+        ZSTD_cwksp_assert_internal_consistency(ws);
+    }
+    return 0;
+}
+
+/**
+ * Returns whether this object/buffer/etc was allocated in this workspace.
+ */
+MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr)
+{
+    return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd);
+}
+
+/**
+ * Internal function. Do not use directly.
+ */
+MEM_STATIC void*
+ZSTD_cwksp_reserve_internal(ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase)
+{
+    void* alloc;
+    if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase)) || bytes == 0) {
+        return NULL;
+    }
+
+#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+    /* over-reserve space */
+    bytes += 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+#endif
+
+    alloc = ZSTD_cwksp_reserve_internal_buffer_space(ws, bytes);
+
+#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+    /* Move alloc so there's ZSTD_CWKSP_ASAN_REDZONE_SIZE unused space on
+     * either size. */
+    if (alloc) {
+        alloc = (BYTE *)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+        if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) {
+            /* We need to keep the redzone poisoned while unpoisoning the bytes that
+             * are actually allocated. */
+            __asan_unpoison_memory_region(alloc, bytes - 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE);
+        }
+    }
+#endif
+
+    return alloc;
+}
+
+/**
+ * Reserves and returns unaligned memory.
+ */
+MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes)
+{
+    return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers);
+}
+
+/**
+ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes).
+ * This memory has been initialized at least once in the past.
+ * This doesn't mean it has been initialized this time, and it might contain data from previous
+ * operations.
+ * The main usage is for algorithms that might need read access into uninitialized memory.
+ * The algorithm must maintain safety under these conditions and must make sure it doesn't
+ * leak any of the past data (directly or in side channels).
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes)
+{
+    size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES);
+    void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once);
+    assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0);
+    if(ptr && ptr < ws->initOnceStart) {
+        /* We assume the memory following the current allocation is either:
+         * 1. Not usable as initOnce memory (end of workspace)
+         * 2. Another initOnce buffer that has been allocated before (and so was previously memset)
+         * 3. An ASAN redzone, in which case we don't want to write on it
+         * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart.
+         * Note that we assume here that MSAN and ASAN cannot run in the same time. */
+        ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes));
+        ws->initOnceStart = ptr;
+    }
+#if ZSTD_MEMORY_SANITIZER
+    assert(__msan_test_shadow(ptr, bytes) == -1);
+#endif
+    return ptr;
+}
+
+/**
+ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes).
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes)
+{
+    void* ptr = ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES),
+                                            ZSTD_cwksp_alloc_aligned);
+    assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0);
+    return ptr;
+}
+
+/**
+ * Aligned on 64 bytes. These buffers have the special property that
+ * their values remain constrained, allowing us to re-use them without
+ * memset()-ing them.
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes)
+{
+    const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once;
+    void* alloc;
+    void* end;
+    void* top;
+
+    /* We can only start allocating tables after we are done reserving space for objects at the
+     * start of the workspace */
+    if(ws->phase < phase) {
+        if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) {
+            return NULL;
+        }
+    }
+    alloc = ws->tableEnd;
+    end = (BYTE *)alloc + bytes;
+    top = ws->allocStart;
+
+    DEBUGLOG(5, "cwksp: reserving %p table %zd bytes, %zd bytes remaining",
+        alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes);
+    assert((bytes & (sizeof(U32)-1)) == 0);
+    ZSTD_cwksp_assert_internal_consistency(ws);
+    assert(end <= top);
+    if (end > top) {
+        DEBUGLOG(4, "cwksp: table alloc failed!");
+        ws->allocFailed = 1;
+        return NULL;
+    }
+    ws->tableEnd = end;
+
+#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+    if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) {
+        __asan_unpoison_memory_region(alloc, bytes);
+    }
+#endif
+
+    assert((bytes & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0);
+    assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0);
+    return alloc;
+}
+
+/**
+ * Aligned on sizeof(void*).
+ * Note : should happen only once, at workspace first initialization
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes)
+{
+    size_t const roundedBytes = ZSTD_cwksp_align(bytes, sizeof(void*));
+    void* alloc = ws->objectEnd;
+    void* end = (BYTE*)alloc + roundedBytes;
+
+#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+    /* over-reserve space */
+    end = (BYTE *)end + 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+#endif
+
+    DEBUGLOG(4,
+        "cwksp: reserving %p object %zd bytes (rounded to %zd), %zd bytes remaining",
+        alloc, bytes, roundedBytes, ZSTD_cwksp_available_space(ws) - roundedBytes);
+    assert((size_t)alloc % ZSTD_ALIGNOF(void*) == 0);
+    assert(bytes % ZSTD_ALIGNOF(void*) == 0);
+    ZSTD_cwksp_assert_internal_consistency(ws);
+    /* we must be in the first phase, no advance is possible */
+    if (ws->phase != ZSTD_cwksp_alloc_objects || end > ws->workspaceEnd) {
+        DEBUGLOG(3, "cwksp: object alloc failed!");
+        ws->allocFailed = 1;
+        return NULL;
+    }
+    ws->objectEnd = end;
+    ws->tableEnd = end;
+    ws->tableValidEnd = end;
+
+#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+    /* Move alloc so there's ZSTD_CWKSP_ASAN_REDZONE_SIZE unused space on
+     * either size. */
+    alloc = (BYTE*)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+    if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) {
+        __asan_unpoison_memory_region(alloc, bytes);
+    }
+#endif
+
+    return alloc;
+}
+
+MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws)
+{
+    DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_dirty");
+
+#if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE)
+    /* To validate that the table re-use logic is sound, and that we don't
+     * access table space that we haven't cleaned, we re-"poison" the table
+     * space every time we mark it dirty.
+     * Since tableValidEnd space and initOnce space may overlap we don't poison
+     * the initOnce portion as it break its promise. This means that this poisoning
+     * check isn't always applied fully. */
+    {
+        size_t size = (BYTE*)ws->tableValidEnd - (BYTE*)ws->objectEnd;
+        assert(__msan_test_shadow(ws->objectEnd, size) == -1);
+        if((BYTE*)ws->tableValidEnd < (BYTE*)ws->initOnceStart) {
+            __msan_poison(ws->objectEnd, size);
+        } else {
+            assert(ws->initOnceStart >= ws->objectEnd);
+            __msan_poison(ws->objectEnd, (BYTE*)ws->initOnceStart - (BYTE*)ws->objectEnd);
+        }
+    }
+#endif
+
+    assert(ws->tableValidEnd >= ws->objectEnd);
+    assert(ws->tableValidEnd <= ws->allocStart);
+    ws->tableValidEnd = ws->objectEnd;
+    ZSTD_cwksp_assert_internal_consistency(ws);
+}
+
+MEM_STATIC void ZSTD_cwksp_mark_tables_clean(ZSTD_cwksp* ws) {
+    DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_clean");
+    assert(ws->tableValidEnd >= ws->objectEnd);
+    assert(ws->tableValidEnd <= ws->allocStart);
+    if (ws->tableValidEnd < ws->tableEnd) {
+        ws->tableValidEnd = ws->tableEnd;
+    }
+    ZSTD_cwksp_assert_internal_consistency(ws);
+}
+
+/**
+ * Zero the part of the allocated tables not already marked clean.
+ */
+MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) {
+    DEBUGLOG(4, "cwksp: ZSTD_cwksp_clean_tables");
+    assert(ws->tableValidEnd >= ws->objectEnd);
+    assert(ws->tableValidEnd <= ws->allocStart);
+    if (ws->tableValidEnd < ws->tableEnd) {
+        ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd));
+    }
+    ZSTD_cwksp_mark_tables_clean(ws);
+}
+
+/**
+ * Invalidates table allocations.
+ * All other allocations remain valid.
+ */
+MEM_STATIC void ZSTD_cwksp_clear_tables(ZSTD_cwksp* ws) {
+    DEBUGLOG(4, "cwksp: clearing tables!");
+
+#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+    /* We don't do this when the workspace is statically allocated, because
+     * when that is the case, we have no capability to hook into the end of the
+     * workspace's lifecycle to unpoison the memory.
+     */
+    if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) {
+        size_t size = (BYTE*)ws->tableValidEnd - (BYTE*)ws->objectEnd;
+        __asan_poison_memory_region(ws->objectEnd, size);
+    }
+#endif
+
+    ws->tableEnd = ws->objectEnd;
+    ZSTD_cwksp_assert_internal_consistency(ws);
+}
+
+/**
+ * Invalidates all buffer, aligned, and table allocations.
+ * Object allocations remain valid.
+ */
+MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) {
+    DEBUGLOG(4, "cwksp: clearing!");
+
+#if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE)
+    /* To validate that the context re-use logic is sound, and that we don't
+     * access stuff that this compression hasn't initialized, we re-"poison"
+     * the workspace except for the areas in which we expect memory re-use
+     * without initialization (objects, valid tables area and init once
+     * memory). */
+    {
+        if((BYTE*)ws->tableValidEnd < (BYTE*)ws->initOnceStart) {
+            size_t size = (BYTE*)ws->initOnceStart - (BYTE*)ws->tableValidEnd;
+            __msan_poison(ws->tableValidEnd, size);
+        }
+    }
+#endif
+
+#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+    /* We don't do this when the workspace is statically allocated, because
+     * when that is the case, we have no capability to hook into the end of the
+     * workspace's lifecycle to unpoison the memory.
+     */
+    if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) {
+        size_t size = (BYTE*)ws->workspaceEnd - (BYTE*)ws->objectEnd;
+        __asan_poison_memory_region(ws->objectEnd, size);
+    }
+#endif
+
+    ws->tableEnd = ws->objectEnd;
+    ws->allocStart = ZSTD_cwksp_initialAllocStart(ws);
+    ws->allocFailed = 0;
+    if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) {
+        ws->phase = ZSTD_cwksp_alloc_aligned_init_once;
+    }
+    ZSTD_cwksp_assert_internal_consistency(ws);
+}
+
+/**
+ * The provided workspace takes ownership of the buffer [start, start+size).
+ * Any existing values in the workspace are ignored (the previously managed
+ * buffer, if present, must be separately freed).
+ */
+MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_cwksp_static_alloc_e isStatic) {
+    DEBUGLOG(4, "cwksp: init'ing workspace with %zd bytes", size);
+    assert(((size_t)start & (sizeof(void*)-1)) == 0); /* ensure correct alignment */
+    ws->workspace = start;
+    ws->workspaceEnd = (BYTE*)start + size;
+    ws->objectEnd = ws->workspace;
+    ws->tableValidEnd = ws->objectEnd;
+    ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws);
+    ws->phase = ZSTD_cwksp_alloc_objects;
+    ws->isStatic = isStatic;
+    ZSTD_cwksp_clear(ws);
+    ws->workspaceOversizedDuration = 0;
+    ZSTD_cwksp_assert_internal_consistency(ws);
+}
+
+MEM_STATIC size_t ZSTD_cwksp_create(ZSTD_cwksp* ws, size_t size, ZSTD_customMem customMem) {
+    void* workspace = ZSTD_customMalloc(size, customMem);
+    DEBUGLOG(4, "cwksp: creating new workspace with %zd bytes", size);
+    RETURN_ERROR_IF(workspace == NULL, memory_allocation, "NULL pointer!");
+    ZSTD_cwksp_init(ws, workspace, size, ZSTD_cwksp_dynamic_alloc);
+    return 0;
+}
+
+MEM_STATIC void ZSTD_cwksp_free(ZSTD_cwksp* ws, ZSTD_customMem customMem) {
+    void *ptr = ws->workspace;
+    DEBUGLOG(4, "cwksp: freeing workspace");
+    ZSTD_memset(ws, 0, sizeof(ZSTD_cwksp));
+    ZSTD_customFree(ptr, customMem);
+}
+
+/**
+ * Moves the management of a workspace from one cwksp to another. The src cwksp
+ * is left in an invalid state (src must be re-init()'ed before it's used again).
+ */
+MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) {
+    *dst = *src;
+    ZSTD_memset(src, 0, sizeof(ZSTD_cwksp));
+}
+
+MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) {
+    return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace);
+}
+
+MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) {
+    return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace)
+         + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart);
+}
+
+MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) {
+    return ws->allocFailed;
+}
+
+/*-*************************************
+*  Functions Checking Free Space
+***************************************/
+
+/* ZSTD_alignmentSpaceWithinBounds() :
+ * Returns if the estimated space needed for a wksp is within an acceptable limit of the
+ * actual amount of space used.
+ */
+MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) {
+    /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice
+     * the alignment bytes difference between estimation and actual usage */
+    return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) &&
+           ZSTD_cwksp_used(ws) <= estimatedSpace;
+}
+
+
+MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws) {
+    return (size_t)((BYTE*)ws->allocStart - (BYTE*)ws->tableEnd);
+}
+
+MEM_STATIC int ZSTD_cwksp_check_available(ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+    return ZSTD_cwksp_available_space(ws) >= additionalNeededSpace;
+}
+
+MEM_STATIC int ZSTD_cwksp_check_too_large(ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+    return ZSTD_cwksp_check_available(
+        ws, additionalNeededSpace * ZSTD_WORKSPACETOOLARGE_FACTOR);
+}
+
+MEM_STATIC int ZSTD_cwksp_check_wasteful(ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+    return ZSTD_cwksp_check_too_large(ws, additionalNeededSpace)
+        && ws->workspaceOversizedDuration > ZSTD_WORKSPACETOOLARGE_MAXDURATION;
+}
+
+MEM_STATIC void ZSTD_cwksp_bump_oversized_duration(
+        ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+    if (ZSTD_cwksp_check_too_large(ws, additionalNeededSpace)) {
+        ws->workspaceOversizedDuration++;
+    } else {
+        ws->workspaceOversizedDuration = 0;
+    }
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_CWKSP_H */
--- a/third_party/zstd/lib/compress/zstd_double_fast.c
+++ b/third_party/zstd/lib/compress/zstd_double_fast.c
@ -0,0 +1,763 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include "third_party/zstd/lib/compress/zstd_compress_internal.h"
+#include "third_party/zstd/lib/compress/zstd_double_fast.h"
+
+#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR
+
+static void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms,
+                              void const* end, ZSTD_dictTableLoadMethod_e dtlm)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashLarge = ms->hashTable;
+    U32  const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    U32  const mls = cParams->minMatch;
+    U32* const hashSmall = ms->chainTable;
+    U32  const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    const BYTE* const base = ms->window.base;
+    const BYTE* ip = base + ms->nextToUpdate;
+    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
+    const U32 fastHashFillStep = 3;
+
+    /* Always insert every fastHashFillStep position into the hash tables.
+     * Insert the other positions into the large hash table if their entry
+     * is empty.
+     */
+    for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) {
+        U32 const curr = (U32)(ip - base);
+        U32 i;
+        for (i = 0; i < fastHashFillStep; ++i) {
+            size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls);
+            size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8);
+            if (i == 0) {
+                ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i);
+            }
+            if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) {
+                ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i);
+            }
+            /* Only load extra positions for ZSTD_dtlm_full */
+            if (dtlm == ZSTD_dtlm_fast)
+                break;
+    }   }
+}
+
+static void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms,
+                              void const* end, ZSTD_dictTableLoadMethod_e dtlm)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashLarge = ms->hashTable;
+    U32  const hBitsL = cParams->hashLog;
+    U32  const mls = cParams->minMatch;
+    U32* const hashSmall = ms->chainTable;
+    U32  const hBitsS = cParams->chainLog;
+    const BYTE* const base = ms->window.base;
+    const BYTE* ip = base + ms->nextToUpdate;
+    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
+    const U32 fastHashFillStep = 3;
+
+    /* Always insert every fastHashFillStep position into the hash tables.
+     * Insert the other positions into the large hash table if their entry
+     * is empty.
+     */
+    for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) {
+        U32 const curr = (U32)(ip - base);
+        U32 i;
+        for (i = 0; i < fastHashFillStep; ++i) {
+            size_t const smHash = ZSTD_hashPtr(ip + i, hBitsS, mls);
+            size_t const lgHash = ZSTD_hashPtr(ip + i, hBitsL, 8);
+            if (i == 0)
+                hashSmall[smHash] = curr + i;
+            if (i == 0 || hashLarge[lgHash] == 0)
+                hashLarge[lgHash] = curr + i;
+            /* Only load extra positions for ZSTD_dtlm_full */
+            if (dtlm == ZSTD_dtlm_fast)
+                break;
+        }   }
+}
+
+void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+                        const void* const end,
+                        ZSTD_dictTableLoadMethod_e dtlm,
+                        ZSTD_tableFillPurpose_e tfp)
+{
+    if (tfp == ZSTD_tfp_forCDict) {
+        ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm);
+    } else {
+        ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm);
+    }
+}
+
+
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_compressBlock_doubleFast_noDict_generic(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize, U32 const mls /* template */)
+{
+    ZSTD_compressionParameters const* cParams = &ms->cParams;
+    U32* const hashLong = ms->hashTable;
+    const U32 hBitsL = cParams->hashLog;
+    U32* const hashSmall = ms->chainTable;
+    const U32 hBitsS = cParams->chainLog;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* anchor = istart;
+    const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
+    /* presumes that, if there is a dictionary, it must be using Attach mode */
+    const U32 prefixLowestIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
+    const BYTE* const prefixLowest = base + prefixLowestIndex;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - HASH_READ_SIZE;
+    U32 offset_1=rep[0], offset_2=rep[1];
+    U32 offsetSaved1 = 0, offsetSaved2 = 0;
+
+    size_t mLength;
+    U32 offset;
+    U32 curr;
+
+    /* how many positions to search before increasing step size */
+    const size_t kStepIncr = 1 << kSearchStrength;
+    /* the position at which to increment the step size if no match is found */
+    const BYTE* nextStep;
+    size_t step; /* the current step size */
+
+    size_t hl0; /* the long hash at ip */
+    size_t hl1; /* the long hash at ip1 */
+
+    U32 idxl0; /* the long match index for ip */
+    U32 idxl1; /* the long match index for ip1 */
+
+    const BYTE* matchl0; /* the long match for ip */
+    const BYTE* matchs0; /* the short match for ip */
+    const BYTE* matchl1; /* the long match for ip1 */
+
+    const BYTE* ip = istart; /* the current position */
+    const BYTE* ip1; /* the next position */
+
+    DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_noDict_generic");
+
+    /* init */
+    ip += ((ip - prefixLowest) == 0);
+    {
+        U32 const current = (U32)(ip - base);
+        U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog);
+        U32 const maxRep = current - windowLow;
+        if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
+        if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
+    }
+
+    /* Outer Loop: one iteration per match found and stored */
+    while (1) {
+        step = 1;
+        nextStep = ip + kStepIncr;
+        ip1 = ip + step;
+
+        if (ip1 > ilimit) {
+            goto _cleanup;
+        }
+
+        hl0 = ZSTD_hashPtr(ip, hBitsL, 8);
+        idxl0 = hashLong[hl0];
+        matchl0 = base + idxl0;
+
+        /* Inner Loop: one iteration per search / position */
+        do {
+            const size_t hs0 = ZSTD_hashPtr(ip, hBitsS, mls);
+            const U32 idxs0 = hashSmall[hs0];
+            curr = (U32)(ip-base);
+            matchs0 = base + idxs0;
+
+            hashLong[hl0] = hashSmall[hs0] = curr;   /* update hash tables */
+
+            /* check noDict repcode */
+            if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) {
+                mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
+                ip++;
+                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
+                goto _match_stored;
+            }
+
+            hl1 = ZSTD_hashPtr(ip1, hBitsL, 8);
+
+            if (idxl0 > prefixLowestIndex) {
+                /* check prefix long match */
+                if (MEM_read64(matchl0) == MEM_read64(ip)) {
+                    mLength = ZSTD_count(ip+8, matchl0+8, iend) + 8;
+                    offset = (U32)(ip-matchl0);
+                    while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-1] == matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */
+                    goto _match_found;
+                }
+            }
+
+            idxl1 = hashLong[hl1];
+            matchl1 = base + idxl1;
+
+            if (idxs0 > prefixLowestIndex) {
+                /* check prefix short match */
+                if (MEM_read32(matchs0) == MEM_read32(ip)) {
+                    goto _search_next_long;
+                }
+            }
+
+            if (ip1 >= nextStep) {
+                PREFETCH_L1(ip1 + 64);
+                PREFETCH_L1(ip1 + 128);
+                step++;
+                nextStep += kStepIncr;
+            }
+            ip = ip1;
+            ip1 += step;
+
+            hl0 = hl1;
+            idxl0 = idxl1;
+            matchl0 = matchl1;
+    #if defined(__aarch64__)
+            PREFETCH_L1(ip+256);
+    #endif
+        } while (ip1 <= ilimit);
+
+_cleanup:
+        /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
+         * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
+        offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
+
+        /* save reps for next block */
+        rep[0] = offset_1 ? offset_1 : offsetSaved1;
+        rep[1] = offset_2 ? offset_2 : offsetSaved2;
+
+        /* Return the last literals size */
+        return (size_t)(iend - anchor);
+
+_search_next_long:
+
+        /* check prefix long +1 match */
+        if (idxl1 > prefixLowestIndex) {
+            if (MEM_read64(matchl1) == MEM_read64(ip1)) {
+                ip = ip1;
+                mLength = ZSTD_count(ip+8, matchl1+8, iend) + 8;
+                offset = (U32)(ip-matchl1);
+                while (((ip>anchor) & (matchl1>prefixLowest)) && (ip[-1] == matchl1[-1])) { ip--; matchl1--; mLength++; } /* catch up */
+                goto _match_found;
+            }
+        }
+
+        /* if no long +1 match, explore the short match we found */
+        mLength = ZSTD_count(ip+4, matchs0+4, iend) + 4;
+        offset = (U32)(ip - matchs0);
+        while (((ip>anchor) & (matchs0>prefixLowest)) && (ip[-1] == matchs0[-1])) { ip--; matchs0--; mLength++; } /* catch up */
+
+        /* fall-through */
+
+_match_found: /* requires ip, offset, mLength */
+        offset_2 = offset_1;
+        offset_1 = offset;
+
+        if (step < 4) {
+            /* It is unsafe to write this value back to the hashtable when ip1 is
+             * greater than or equal to the new ip we will have after we're done
+             * processing this match. Rather than perform that test directly
+             * (ip1 >= ip + mLength), which costs speed in practice, we do a simpler
+             * more predictable test. The minmatch even if we take a short match is
+             * 4 bytes, so as long as step, the distance between ip and ip1
+             * (initially) is less than 4, we know ip1 < new ip. */
+            hashLong[hl1] = (U32)(ip1 - base);
+        }
+
+        ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+
+_match_stored:
+        /* match found */
+        ip += mLength;
+        anchor = ip;
+
+        if (ip <= ilimit) {
+            /* Complementary insertion */
+            /* done after iLimit test, as candidates could be > iend-8 */
+            {   U32 const indexToInsert = curr+2;
+                hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert;
+                hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base);
+                hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert;
+                hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base);
+            }
+
+            /* check immediate repcode */
+            while ( (ip <= ilimit)
+                 && ( (offset_2>0)
+                    & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
+                /* store sequence */
+                size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
+                U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff;  /* swap offset_2 <=> offset_1 */
+                hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base);
+                hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base);
+                ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
+                ip += rLength;
+                anchor = ip;
+                continue;   /* faster when present ... (?) */
+            }
+        }
+    }
+}
+
+
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize,
+        U32 const mls /* template */)
+{
+    ZSTD_compressionParameters const* cParams = &ms->cParams;
+    U32* const hashLong = ms->hashTable;
+    const U32 hBitsL = cParams->hashLog;
+    U32* const hashSmall = ms->chainTable;
+    const U32 hBitsS = cParams->chainLog;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
+    /* presumes that, if there is a dictionary, it must be using Attach mode */
+    const U32 prefixLowestIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
+    const BYTE* const prefixLowest = base + prefixLowestIndex;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - HASH_READ_SIZE;
+    U32 offset_1=rep[0], offset_2=rep[1];
+
+    const ZSTD_matchState_t* const dms = ms->dictMatchState;
+    const ZSTD_compressionParameters* const dictCParams = &dms->cParams;
+    const U32* const dictHashLong  = dms->hashTable;
+    const U32* const dictHashSmall = dms->chainTable;
+    const U32 dictStartIndex       = dms->window.dictLimit;
+    const BYTE* const dictBase     = dms->window.base;
+    const BYTE* const dictStart    = dictBase + dictStartIndex;
+    const BYTE* const dictEnd      = dms->window.nextSrc;
+    const U32 dictIndexDelta       = prefixLowestIndex - (U32)(dictEnd - dictBase);
+    const U32 dictHBitsL           = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    const U32 dictHBitsS           = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    const U32 dictAndPrefixLength  = (U32)((ip - prefixLowest) + (dictEnd - dictStart));
+
+    DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic");
+
+    /* if a dictionary is attached, it must be within window range */
+    assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex);
+
+    if (ms->prefetchCDictTables) {
+        size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
+        size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32);
+        PREFETCH_AREA(dictHashLong, hashTableBytes)
+        PREFETCH_AREA(dictHashSmall, chainTableBytes)
+    }
+
+    /* init */
+    ip += (dictAndPrefixLength == 0);
+
+    /* dictMatchState repCode checks don't currently handle repCode == 0
+     * disabling. */
+    assert(offset_1 <= dictAndPrefixLength);
+    assert(offset_2 <= dictAndPrefixLength);
+
+    /* Main Search Loop */
+    while (ip < ilimit) {   /* < instead of <=, because repcode check at (ip+1) */
+        size_t mLength;
+        U32 offset;
+        size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8);
+        size_t const h = ZSTD_hashPtr(ip, hBitsS, mls);
+        size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8);
+        size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls);
+        U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS];
+        U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS];
+        int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL);
+        int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS);
+        U32 const curr = (U32)(ip-base);
+        U32 const matchIndexL = hashLong[h2];
+        U32 matchIndexS = hashSmall[h];
+        const BYTE* matchLong = base + matchIndexL;
+        const BYTE* match = base + matchIndexS;
+        const U32 repIndex = curr + 1 - offset_1;
+        const BYTE* repMatch = (repIndex < prefixLowestIndex) ?
+                               dictBase + (repIndex - dictIndexDelta) :
+                               base + repIndex;
+        hashLong[h2] = hashSmall[h] = curr;   /* update hash tables */
+
+        /* check repcode */
+        if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+            && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+            const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+            ip++;
+            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
+            goto _match_stored;
+        }
+
+        if (matchIndexL > prefixLowestIndex) {
+            /* check prefix long match */
+            if (MEM_read64(matchLong) == MEM_read64(ip)) {
+                mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8;
+                offset = (U32)(ip-matchLong);
+                while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
+                goto _match_found;
+            }
+        } else if (dictTagsMatchL) {
+            /* check dictMatchState long match */
+            U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS;
+            const BYTE* dictMatchL = dictBase + dictMatchIndexL;
+            assert(dictMatchL < dictEnd);
+
+            if (dictMatchL > dictStart && MEM_read64(dictMatchL) == MEM_read64(ip)) {
+                mLength = ZSTD_count_2segments(ip+8, dictMatchL+8, iend, dictEnd, prefixLowest) + 8;
+                offset = (U32)(curr - dictMatchIndexL - dictIndexDelta);
+                while (((ip>anchor) & (dictMatchL>dictStart)) && (ip[-1] == dictMatchL[-1])) { ip--; dictMatchL--; mLength++; } /* catch up */
+                goto _match_found;
+        }   }
+
+        if (matchIndexS > prefixLowestIndex) {
+            /* check prefix short match */
+            if (MEM_read32(match) == MEM_read32(ip)) {
+                goto _search_next_long;
+            }
+        } else if (dictTagsMatchS) {
+            /* check dictMatchState short match */
+            U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS;
+            match = dictBase + dictMatchIndexS;
+            matchIndexS = dictMatchIndexS + dictIndexDelta;
+
+            if (match > dictStart && MEM_read32(match) == MEM_read32(ip)) {
+                goto _search_next_long;
+        }   }
+
+        ip += ((ip-anchor) >> kSearchStrength) + 1;
+#if defined(__aarch64__)
+        PREFETCH_L1(ip+256);
+#endif
+        continue;
+
+_search_next_long:
+        {   size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
+            size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8);
+            U32 const matchIndexL3 = hashLong[hl3];
+            U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS];
+            int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3);
+            const BYTE* matchL3 = base + matchIndexL3;
+            hashLong[hl3] = curr + 1;
+
+            /* check prefix long +1 match */
+            if (matchIndexL3 > prefixLowestIndex) {
+                if (MEM_read64(matchL3) == MEM_read64(ip+1)) {
+                    mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8;
+                    ip++;
+                    offset = (U32)(ip-matchL3);
+                    while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */
+                    goto _match_found;
+                }
+            } else if (dictTagsMatchL3) {
+                /* check dict long +1 match */
+                U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS;
+                const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3;
+                assert(dictMatchL3 < dictEnd);
+                if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) {
+                    mLength = ZSTD_count_2segments(ip+1+8, dictMatchL3+8, iend, dictEnd, prefixLowest) + 8;
+                    ip++;
+                    offset = (U32)(curr + 1 - dictMatchIndexL3 - dictIndexDelta);
+                    while (((ip>anchor) & (dictMatchL3>dictStart)) && (ip[-1] == dictMatchL3[-1])) { ip--; dictMatchL3--; mLength++; } /* catch up */
+                    goto _match_found;
+        }   }   }
+
+        /* if no long +1 match, explore the short match we found */
+        if (matchIndexS < prefixLowestIndex) {
+            mLength = ZSTD_count_2segments(ip+4, match+4, iend, dictEnd, prefixLowest) + 4;
+            offset = (U32)(curr - matchIndexS);
+            while (((ip>anchor) & (match>dictStart)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+        } else {
+            mLength = ZSTD_count(ip+4, match+4, iend) + 4;
+            offset = (U32)(ip - match);
+            while (((ip>anchor) & (match>prefixLowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+        }
+
+_match_found:
+        offset_2 = offset_1;
+        offset_1 = offset;
+
+        ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+
+_match_stored:
+        /* match found */
+        ip += mLength;
+        anchor = ip;
+
+        if (ip <= ilimit) {
+            /* Complementary insertion */
+            /* done after iLimit test, as candidates could be > iend-8 */
+            {   U32 const indexToInsert = curr+2;
+                hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert;
+                hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base);
+                hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert;
+                hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base);
+            }
+
+            /* check immediate repcode */
+            while (ip <= ilimit) {
+                U32 const current2 = (U32)(ip-base);
+                U32 const repIndex2 = current2 - offset_2;
+                const BYTE* repMatch2 = repIndex2 < prefixLowestIndex ?
+                        dictBase + repIndex2 - dictIndexDelta :
+                        base + repIndex2;
+                if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */)
+                   && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+                    const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend;
+                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4;
+                    U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
+                    hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
+                    hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
+                    ip += repLength2;
+                    anchor = ip;
+                    continue;
+                }
+                break;
+            }
+        }
+    }   /* while (ip < ilimit) */
+
+    /* save reps for next block */
+    rep[0] = offset_1;
+    rep[1] = offset_2;
+
+    /* Return the last literals size */
+    return (size_t)(iend - anchor);
+}
+
+#define ZSTD_GEN_DFAST_FN(dictMode, mls)                                                                 \
+    static size_t ZSTD_compressBlock_doubleFast_##dictMode##_##mls(                                      \
+            ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],                          \
+            void const* src, size_t srcSize)                                                             \
+    {                                                                                                    \
+        return ZSTD_compressBlock_doubleFast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls); \
+    }
+
+ZSTD_GEN_DFAST_FN(noDict, 4)
+ZSTD_GEN_DFAST_FN(noDict, 5)
+ZSTD_GEN_DFAST_FN(noDict, 6)
+ZSTD_GEN_DFAST_FN(noDict, 7)
+
+ZSTD_GEN_DFAST_FN(dictMatchState, 4)
+ZSTD_GEN_DFAST_FN(dictMatchState, 5)
+ZSTD_GEN_DFAST_FN(dictMatchState, 6)
+ZSTD_GEN_DFAST_FN(dictMatchState, 7)
+
+
+size_t ZSTD_compressBlock_doubleFast(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    const U32 mls = ms->cParams.minMatch;
+    switch(mls)
+    {
+    default: /* includes case 3 */
+    case 4 :
+        return ZSTD_compressBlock_doubleFast_noDict_4(ms, seqStore, rep, src, srcSize);
+    case 5 :
+        return ZSTD_compressBlock_doubleFast_noDict_5(ms, seqStore, rep, src, srcSize);
+    case 6 :
+        return ZSTD_compressBlock_doubleFast_noDict_6(ms, seqStore, rep, src, srcSize);
+    case 7 :
+        return ZSTD_compressBlock_doubleFast_noDict_7(ms, seqStore, rep, src, srcSize);
+    }
+}
+
+
+size_t ZSTD_compressBlock_doubleFast_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    const U32 mls = ms->cParams.minMatch;
+    switch(mls)
+    {
+    default: /* includes case 3 */
+    case 4 :
+        return ZSTD_compressBlock_doubleFast_dictMatchState_4(ms, seqStore, rep, src, srcSize);
+    case 5 :
+        return ZSTD_compressBlock_doubleFast_dictMatchState_5(ms, seqStore, rep, src, srcSize);
+    case 6 :
+        return ZSTD_compressBlock_doubleFast_dictMatchState_6(ms, seqStore, rep, src, srcSize);
+    case 7 :
+        return ZSTD_compressBlock_doubleFast_dictMatchState_7(ms, seqStore, rep, src, srcSize);
+    }
+}
+
+
+static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize,
+        U32 const mls /* template */)
+{
+    ZSTD_compressionParameters const* cParams = &ms->cParams;
+    U32* const hashLong = ms->hashTable;
+    U32  const hBitsL = cParams->hashLog;
+    U32* const hashSmall = ms->chainTable;
+    U32  const hBitsS = cParams->chainLog;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    const BYTE* const base = ms->window.base;
+    const U32   endIndex = (U32)((size_t)(istart - base) + srcSize);
+    const U32   lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
+    const U32   dictStartIndex = lowLimit;
+    const U32   dictLimit = ms->window.dictLimit;
+    const U32   prefixStartIndex = (dictLimit > lowLimit) ? dictLimit : lowLimit;
+    const BYTE* const prefixStart = base + prefixStartIndex;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const BYTE* const dictStart = dictBase + dictStartIndex;
+    const BYTE* const dictEnd = dictBase + prefixStartIndex;
+    U32 offset_1=rep[0], offset_2=rep[1];
+
+    DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_extDict_generic (srcSize=%zu)", srcSize);
+
+    /* if extDict is invalidated due to maxDistance, switch to "regular" variant */
+    if (prefixStartIndex == dictStartIndex)
+        return ZSTD_compressBlock_doubleFast(ms, seqStore, rep, src, srcSize);
+
+    /* Search Loop */
+    while (ip < ilimit) {  /* < instead of <=, because (ip+1) */
+        const size_t hSmall = ZSTD_hashPtr(ip, hBitsS, mls);
+        const U32 matchIndex = hashSmall[hSmall];
+        const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base;
+        const BYTE* match = matchBase + matchIndex;
+
+        const size_t hLong = ZSTD_hashPtr(ip, hBitsL, 8);
+        const U32 matchLongIndex = hashLong[hLong];
+        const BYTE* const matchLongBase = matchLongIndex < prefixStartIndex ? dictBase : base;
+        const BYTE* matchLong = matchLongBase + matchLongIndex;
+
+        const U32 curr = (U32)(ip-base);
+        const U32 repIndex = curr + 1 - offset_1;   /* offset_1 expected <= curr +1 */
+        const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
+        const BYTE* const repMatch = repBase + repIndex;
+        size_t mLength;
+        hashSmall[hSmall] = hashLong[hLong] = curr;   /* update hash table */
+
+        if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */
+            & (offset_1 <= curr+1 - dictStartIndex)) /* note: we are searching at curr+1 */
+          && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+            const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
+            ip++;
+            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
+        } else {
+            if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) {
+                const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend;
+                const BYTE* const lowMatchPtr = matchLongIndex < prefixStartIndex ? dictStart : prefixStart;
+                U32 offset;
+                mLength = ZSTD_count_2segments(ip+8, matchLong+8, iend, matchEnd, prefixStart) + 8;
+                offset = curr - matchLongIndex;
+                while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; }   /* catch up */
+                offset_2 = offset_1;
+                offset_1 = offset;
+                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+
+            } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) {
+                size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
+                U32 const matchIndex3 = hashLong[h3];
+                const BYTE* const match3Base = matchIndex3 < prefixStartIndex ? dictBase : base;
+                const BYTE* match3 = match3Base + matchIndex3;
+                U32 offset;
+                hashLong[h3] = curr + 1;
+                if ( (matchIndex3 > dictStartIndex) && (MEM_read64(match3) == MEM_read64(ip+1)) ) {
+                    const BYTE* const matchEnd = matchIndex3 < prefixStartIndex ? dictEnd : iend;
+                    const BYTE* const lowMatchPtr = matchIndex3 < prefixStartIndex ? dictStart : prefixStart;
+                    mLength = ZSTD_count_2segments(ip+9, match3+8, iend, matchEnd, prefixStart) + 8;
+                    ip++;
+                    offset = curr+1 - matchIndex3;
+                    while (((ip>anchor) & (match3>lowMatchPtr)) && (ip[-1] == match3[-1])) { ip--; match3--; mLength++; } /* catch up */
+                } else {
+                    const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend;
+                    const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart;
+                    mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4;
+                    offset = curr - matchIndex;
+                    while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; }   /* catch up */
+                }
+                offset_2 = offset_1;
+                offset_1 = offset;
+                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+
+            } else {
+                ip += ((ip-anchor) >> kSearchStrength) + 1;
+                continue;
+        }   }
+
+        /* move to next sequence start */
+        ip += mLength;
+        anchor = ip;
+
+        if (ip <= ilimit) {
+            /* Complementary insertion */
+            /* done after iLimit test, as candidates could be > iend-8 */
+            {   U32 const indexToInsert = curr+2;
+                hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert;
+                hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base);
+                hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert;
+                hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base);
+            }
+
+            /* check immediate repcode */
+            while (ip <= ilimit) {
+                U32 const current2 = (U32)(ip-base);
+                U32 const repIndex2 = current2 - offset_2;
+                const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
+                if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3)   /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */
+                    & (offset_2 <= current2 - dictStartIndex))
+                  && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+                    const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+                    U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
+                    hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
+                    hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
+                    ip += repLength2;
+                    anchor = ip;
+                    continue;
+                }
+                break;
+    }   }   }
+
+    /* save reps for next block */
+    rep[0] = offset_1;
+    rep[1] = offset_2;
+
+    /* Return the last literals size */
+    return (size_t)(iend - anchor);
+}
+
+ZSTD_GEN_DFAST_FN(extDict, 4)
+ZSTD_GEN_DFAST_FN(extDict, 5)
+ZSTD_GEN_DFAST_FN(extDict, 6)
+ZSTD_GEN_DFAST_FN(extDict, 7)
+
+size_t ZSTD_compressBlock_doubleFast_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    U32 const mls = ms->cParams.minMatch;
+    switch(mls)
+    {
+    default: /* includes case 3 */
+    case 4 :
+        return ZSTD_compressBlock_doubleFast_extDict_4(ms, seqStore, rep, src, srcSize);
+    case 5 :
+        return ZSTD_compressBlock_doubleFast_extDict_5(ms, seqStore, rep, src, srcSize);
+    case 6 :
+        return ZSTD_compressBlock_doubleFast_extDict_6(ms, seqStore, rep, src, srcSize);
+    case 7 :
+        return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize);
+    }
+}
+
+#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */
--- a/third_party/zstd/lib/compress/zstd_double_fast.h
+++ b/third_party/zstd/lib/compress/zstd_double_fast.h
@ -0,0 +1,51 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_DOUBLE_FAST_H
+#define ZSTD_DOUBLE_FAST_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include "third_party/zstd/lib/common/mem.h"      /* U32 */
+#include "third_party/zstd/lib/compress/zstd_compress_internal.h"     /* ZSTD_CCtx, size_t */
+
+#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR
+
+void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+                              void const* end, ZSTD_dictTableLoadMethod_e dtlm,
+                              ZSTD_tableFillPurpose_e tfp);
+
+size_t ZSTD_compressBlock_doubleFast(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_doubleFast_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_doubleFast_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+
+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST ZSTD_compressBlock_doubleFast
+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE ZSTD_compressBlock_doubleFast_dictMatchState
+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT ZSTD_compressBlock_doubleFast_extDict
+#else
+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST NULL
+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE NULL
+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT NULL
+#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_DOUBLE_FAST_H */
--- a/third_party/zstd/lib/compress/zstd_fast.c
+++ b/third_party/zstd/lib/compress/zstd_fast.c
@ -0,0 +1,961 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include "third_party/zstd/lib/compress/zstd_compress_internal.h"  /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */
+#include "third_party/zstd/lib/compress/zstd_fast.h"
+
+static void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms,
+                        const void* const end,
+                        ZSTD_dictTableLoadMethod_e dtlm)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashTable = ms->hashTable;
+    U32  const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    U32  const mls = cParams->minMatch;
+    const BYTE* const base = ms->window.base;
+    const BYTE* ip = base + ms->nextToUpdate;
+    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
+    const U32 fastHashFillStep = 3;
+
+    /* Currently, we always use ZSTD_dtlm_full for filling CDict tables.
+     * Feel free to remove this assert if there's a good reason! */
+    assert(dtlm == ZSTD_dtlm_full);
+
+    /* Always insert every fastHashFillStep position into the hash table.
+     * Insert the other positions if their hash entry is empty.
+     */
+    for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
+        U32 const curr = (U32)(ip - base);
+        {   size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls);
+            ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr);   }
+
+        if (dtlm == ZSTD_dtlm_fast) continue;
+        /* Only load extra positions for ZSTD_dtlm_full */
+        {   U32 p;
+            for (p = 1; p < fastHashFillStep; ++p) {
+                size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls);
+                if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) {  /* not yet filled */
+                    ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p);
+                }   }   }   }
+}
+
+static void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms,
+                        const void* const end,
+                        ZSTD_dictTableLoadMethod_e dtlm)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashTable = ms->hashTable;
+    U32  const hBits = cParams->hashLog;
+    U32  const mls = cParams->minMatch;
+    const BYTE* const base = ms->window.base;
+    const BYTE* ip = base + ms->nextToUpdate;
+    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
+    const U32 fastHashFillStep = 3;
+
+    /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables.
+     * Feel free to remove this assert if there's a good reason! */
+    assert(dtlm == ZSTD_dtlm_fast);
+
+    /* Always insert every fastHashFillStep position into the hash table.
+     * Insert the other positions if their hash entry is empty.
+     */
+    for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
+        U32 const curr = (U32)(ip - base);
+        size_t const hash0 = ZSTD_hashPtr(ip, hBits, mls);
+        hashTable[hash0] = curr;
+        if (dtlm == ZSTD_dtlm_fast) continue;
+        /* Only load extra positions for ZSTD_dtlm_full */
+        {   U32 p;
+            for (p = 1; p < fastHashFillStep; ++p) {
+                size_t const hash = ZSTD_hashPtr(ip + p, hBits, mls);
+                if (hashTable[hash] == 0) {  /* not yet filled */
+                    hashTable[hash] = curr + p;
+    }   }   }   }
+}
+
+void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+                        const void* const end,
+                        ZSTD_dictTableLoadMethod_e dtlm,
+                        ZSTD_tableFillPurpose_e tfp)
+{
+    if (tfp == ZSTD_tfp_forCDict) {
+        ZSTD_fillHashTableForCDict(ms, end, dtlm);
+    } else {
+        ZSTD_fillHashTableForCCtx(ms, end, dtlm);
+    }
+}
+
+
+/**
+ * If you squint hard enough (and ignore repcodes), the search operation at any
+ * given position is broken into 4 stages:
+ *
+ * 1. Hash   (map position to hash value via input read)
+ * 2. Lookup (map hash val to index via hashtable read)
+ * 3. Load   (map index to value at that position via input read)
+ * 4. Compare
+ *
+ * Each of these steps involves a memory read at an address which is computed
+ * from the previous step. This means these steps must be sequenced and their
+ * latencies are cumulative.
+ *
+ * Rather than do 1->2->3->4 sequentially for a single position before moving
+ * onto the next, this implementation interleaves these operations across the
+ * next few positions:
+ *
+ * R = Repcode Read & Compare
+ * H = Hash
+ * T = Table Lookup
+ * M = Match Read & Compare
+ *
+ * Pos | Time -->
+ * ----+-------------------
+ * N   | ... M
+ * N+1 | ...   TM
+ * N+2 |    R H   T M
+ * N+3 |         H    TM
+ * N+4 |           R H   T M
+ * N+5 |                H   ...
+ * N+6 |                  R ...
+ *
+ * This is very much analogous to the pipelining of execution in a CPU. And just
+ * like a CPU, we have to dump the pipeline when we find a match (i.e., take a
+ * branch).
+ *
+ * When this happens, we throw away our current state, and do the following prep
+ * to re-enter the loop:
+ *
+ * Pos | Time -->
+ * ----+-------------------
+ * N   | H T
+ * N+1 |  H
+ *
+ * This is also the work we do at the beginning to enter the loop initially.
+ */
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_compressBlock_fast_noDict_generic(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize,
+        U32 const mls, U32 const hasStep)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashTable = ms->hashTable;
+    U32 const hlog = cParams->hashLog;
+    /* support stepSize of 0 */
+    size_t const stepSize = hasStep ? (cParams->targetLength + !(cParams->targetLength) + 1) : 2;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const istart = (const BYTE*)src;
+    const U32   endIndex = (U32)((size_t)(istart - base) + srcSize);
+    const U32   prefixStartIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
+    const BYTE* const prefixStart = base + prefixStartIndex;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - HASH_READ_SIZE;
+
+    const BYTE* anchor = istart;
+    const BYTE* ip0 = istart;
+    const BYTE* ip1;
+    const BYTE* ip2;
+    const BYTE* ip3;
+    U32 current0;
+
+    U32 rep_offset1 = rep[0];
+    U32 rep_offset2 = rep[1];
+    U32 offsetSaved1 = 0, offsetSaved2 = 0;
+
+    size_t hash0; /* hash for ip0 */
+    size_t hash1; /* hash for ip1 */
+    U32 idx; /* match idx for ip0 */
+    U32 mval; /* src value at match idx */
+
+    U32 offcode;
+    const BYTE* match0;
+    size_t mLength;
+
+    /* ip0 and ip1 are always adjacent. The targetLength skipping and
+     * uncompressibility acceleration is applied to every other position,
+     * matching the behavior of #1562. step therefore represents the gap
+     * between pairs of positions, from ip0 to ip2 or ip1 to ip3. */
+    size_t step;
+    const BYTE* nextStep;
+    const size_t kStepIncr = (1 << (kSearchStrength - 1));
+
+    DEBUGLOG(5, "ZSTD_compressBlock_fast_generic");
+    ip0 += (ip0 == prefixStart);
+    {   U32 const curr = (U32)(ip0 - base);
+        U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog);
+        U32 const maxRep = curr - windowLow;
+        if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0;
+        if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0;
+    }
+
+    /* start each op */
+_start: /* Requires: ip0 */
+
+    step = stepSize;
+    nextStep = ip0 + kStepIncr;
+
+    /* calculate positions, ip0 - anchor == 0, so we skip step calc */
+    ip1 = ip0 + 1;
+    ip2 = ip0 + step;
+    ip3 = ip2 + 1;
+
+    if (ip3 >= ilimit) {
+        goto _cleanup;
+    }
+
+    hash0 = ZSTD_hashPtr(ip0, hlog, mls);
+    hash1 = ZSTD_hashPtr(ip1, hlog, mls);
+
+    idx = hashTable[hash0];
+
+    do {
+        /* load repcode match for ip[2]*/
+        const U32 rval = MEM_read32(ip2 - rep_offset1);
+
+        /* write back hash table entry */
+        current0 = (U32)(ip0 - base);
+        hashTable[hash0] = current0;
+
+        /* check repcode at ip[2] */
+        if ((MEM_read32(ip2) == rval) & (rep_offset1 > 0)) {
+            ip0 = ip2;
+            match0 = ip0 - rep_offset1;
+            mLength = ip0[-1] == match0[-1];
+            ip0 -= mLength;
+            match0 -= mLength;
+            offcode = REPCODE1_TO_OFFBASE;
+            mLength += 4;
+
+            /* First write next hash table entry; we've already calculated it.
+             * This write is known to be safe because the ip1 is before the
+             * repcode (ip2). */
+            hashTable[hash1] = (U32)(ip1 - base);
+
+            goto _match;
+        }
+
+        /* load match for ip[0] */
+        if (idx >= prefixStartIndex) {
+            mval = MEM_read32(base + idx);
+        } else {
+            mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */
+        }
+
+        /* check match at ip[0] */
+        if (MEM_read32(ip0) == mval) {
+            /* found a match! */
+
+            /* First write next hash table entry; we've already calculated it.
+             * This write is known to be safe because the ip1 == ip0 + 1, so
+             * we know we will resume searching after ip1 */
+            hashTable[hash1] = (U32)(ip1 - base);
+
+            goto _offset;
+        }
+
+        /* lookup ip[1] */
+        idx = hashTable[hash1];
+
+        /* hash ip[2] */
+        hash0 = hash1;
+        hash1 = ZSTD_hashPtr(ip2, hlog, mls);
+
+        /* advance to next positions */
+        ip0 = ip1;
+        ip1 = ip2;
+        ip2 = ip3;
+
+        /* write back hash table entry */
+        current0 = (U32)(ip0 - base);
+        hashTable[hash0] = current0;
+
+        /* load match for ip[0] */
+        if (idx >= prefixStartIndex) {
+            mval = MEM_read32(base + idx);
+        } else {
+            mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */
+        }
+
+        /* check match at ip[0] */
+        if (MEM_read32(ip0) == mval) {
+            /* found a match! */
+
+            /* first write next hash table entry; we've already calculated it */
+            if (step <= 4) {
+                /* We need to avoid writing an index into the hash table >= the
+                 * position at which we will pick up our searching after we've
+                 * taken this match.
+                 *
+                 * The minimum possible match has length 4, so the earliest ip0
+                 * can be after we take this match will be the current ip0 + 4.
+                 * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely
+                 * write this position.
+                 */
+                hashTable[hash1] = (U32)(ip1 - base);
+            }
+
+            goto _offset;
+        }
+
+        /* lookup ip[1] */
+        idx = hashTable[hash1];
+
+        /* hash ip[2] */
+        hash0 = hash1;
+        hash1 = ZSTD_hashPtr(ip2, hlog, mls);
+
+        /* advance to next positions */
+        ip0 = ip1;
+        ip1 = ip2;
+        ip2 = ip0 + step;
+        ip3 = ip1 + step;
+
+        /* calculate step */
+        if (ip2 >= nextStep) {
+            step++;
+            PREFETCH_L1(ip1 + 64);
+            PREFETCH_L1(ip1 + 128);
+            nextStep += kStepIncr;
+        }
+    } while (ip3 < ilimit);
+
+_cleanup:
+    /* Note that there are probably still a couple positions we could search.
+     * However, it seems to be a meaningful performance hit to try to search
+     * them. So let's not. */
+
+    /* When the repcodes are outside of the prefix, we set them to zero before the loop.
+     * When the offsets are still zero, we need to restore them after the block to have a correct
+     * repcode history. If only one offset was invalid, it is easy. The tricky case is when both
+     * offsets were invalid. We need to figure out which offset to refill with.
+     *     - If both offsets are zero they are in the same order.
+     *     - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`.
+     *     - If only one is zero, we need to decide which offset to restore.
+     *         - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1.
+     *         - It is impossible for rep_offset2 to be non-zero.
+     *
+     * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then
+     * set rep[0] = rep_offset1 and rep[1] = offsetSaved1.
+     */
+    offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2;
+
+    /* save reps for next block */
+    rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1;
+    rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2;
+
+    /* Return the last literals size */
+    return (size_t)(iend - anchor);
+
+_offset: /* Requires: ip0, idx */
+
+    /* Compute the offset code. */
+    match0 = base + idx;
+    rep_offset2 = rep_offset1;
+    rep_offset1 = (U32)(ip0-match0);
+    offcode = OFFSET_TO_OFFBASE(rep_offset1);
+    mLength = 4;
+
+    /* Count the backwards match length. */
+    while (((ip0>anchor) & (match0>prefixStart)) && (ip0[-1] == match0[-1])) {
+        ip0--;
+        match0--;
+        mLength++;
+    }
+
+_match: /* Requires: ip0, match0, offcode */
+
+    /* Count the forward length. */
+    mLength += ZSTD_count(ip0 + mLength, match0 + mLength, iend);
+
+    ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);
+
+    ip0 += mLength;
+    anchor = ip0;
+
+    /* Fill table and check for immediate repcode. */
+    if (ip0 <= ilimit) {
+        /* Fill Table */
+        assert(base+current0+2 > istart);  /* check base overflow */
+        hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2;  /* here because current+2 could be > iend-8 */
+        hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
+
+        if (rep_offset2 > 0) { /* rep_offset2==0 means rep_offset2 is invalidated */
+            while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - rep_offset2)) ) {
+                /* store sequence */
+                size_t const rLength = ZSTD_count(ip0+4, ip0+4-rep_offset2, iend) + 4;
+                { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */
+                hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
+                ip0 += rLength;
+                ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
+                anchor = ip0;
+                continue;   /* faster when present (confirmed on gcc-8) ... (?) */
+    }   }   }
+
+    goto _start;
+}
+
+#define ZSTD_GEN_FAST_FN(dictMode, mls, step)                                                            \
+    static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##step(                                      \
+            ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],                    \
+            void const* src, size_t srcSize)                                                       \
+    {                                                                                              \
+        return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, step); \
+    }
+
+ZSTD_GEN_FAST_FN(noDict, 4, 1)
+ZSTD_GEN_FAST_FN(noDict, 5, 1)
+ZSTD_GEN_FAST_FN(noDict, 6, 1)
+ZSTD_GEN_FAST_FN(noDict, 7, 1)
+
+ZSTD_GEN_FAST_FN(noDict, 4, 0)
+ZSTD_GEN_FAST_FN(noDict, 5, 0)
+ZSTD_GEN_FAST_FN(noDict, 6, 0)
+ZSTD_GEN_FAST_FN(noDict, 7, 0)
+
+size_t ZSTD_compressBlock_fast(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    U32 const mls = ms->cParams.minMatch;
+    assert(ms->dictMatchState == NULL);
+    if (ms->cParams.targetLength > 1) {
+        switch(mls)
+        {
+        default: /* includes case 3 */
+        case 4 :
+            return ZSTD_compressBlock_fast_noDict_4_1(ms, seqStore, rep, src, srcSize);
+        case 5 :
+            return ZSTD_compressBlock_fast_noDict_5_1(ms, seqStore, rep, src, srcSize);
+        case 6 :
+            return ZSTD_compressBlock_fast_noDict_6_1(ms, seqStore, rep, src, srcSize);
+        case 7 :
+            return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize);
+        }
+    } else {
+        switch(mls)
+        {
+        default: /* includes case 3 */
+        case 4 :
+            return ZSTD_compressBlock_fast_noDict_4_0(ms, seqStore, rep, src, srcSize);
+        case 5 :
+            return ZSTD_compressBlock_fast_noDict_5_0(ms, seqStore, rep, src, srcSize);
+        case 6 :
+            return ZSTD_compressBlock_fast_noDict_6_0(ms, seqStore, rep, src, srcSize);
+        case 7 :
+            return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize);
+        }
+
+    }
+}
+
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_compressBlock_fast_dictMatchState_generic(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashTable = ms->hashTable;
+    U32 const hlog = cParams->hashLog;
+    /* support stepSize of 0 */
+    U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
+    const BYTE* const base = ms->window.base;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip0 = istart;
+    const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */
+    const BYTE* anchor = istart;
+    const U32   prefixStartIndex = ms->window.dictLimit;
+    const BYTE* const prefixStart = base + prefixStartIndex;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - HASH_READ_SIZE;
+    U32 offset_1=rep[0], offset_2=rep[1];
+
+    const ZSTD_matchState_t* const dms = ms->dictMatchState;
+    const ZSTD_compressionParameters* const dictCParams = &dms->cParams ;
+    const U32* const dictHashTable = dms->hashTable;
+    const U32 dictStartIndex       = dms->window.dictLimit;
+    const BYTE* const dictBase     = dms->window.base;
+    const BYTE* const dictStart    = dictBase + dictStartIndex;
+    const BYTE* const dictEnd      = dms->window.nextSrc;
+    const U32 dictIndexDelta       = prefixStartIndex - (U32)(dictEnd - dictBase);
+    const U32 dictAndPrefixLength  = (U32)(istart - prefixStart + dictEnd - dictStart);
+    const U32 dictHBits            = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
+
+    /* if a dictionary is still attached, it necessarily means that
+     * it is within window size. So we just check it. */
+    const U32 maxDistance = 1U << cParams->windowLog;
+    const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
+    assert(endIndex - prefixStartIndex <= maxDistance);
+    (void)maxDistance; (void)endIndex;   /* these variables are not used when assert() is disabled */
+
+    (void)hasStep; /* not currently specialized on whether it's accelerated */
+
+    /* ensure there will be no underflow
+     * when translating a dict index into a local index */
+    assert(prefixStartIndex >= (U32)(dictEnd - dictBase));
+
+    if (ms->prefetchCDictTables) {
+        size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
+        PREFETCH_AREA(dictHashTable, hashTableBytes)
+    }
+
+    /* init */
+    DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic");
+    ip0 += (dictAndPrefixLength == 0);
+    /* dictMatchState repCode checks don't currently handle repCode == 0
+     * disabling. */
+    assert(offset_1 <= dictAndPrefixLength);
+    assert(offset_2 <= dictAndPrefixLength);
+
+    /* Outer search loop */
+    assert(stepSize >= 1);
+    while (ip1 <= ilimit) {   /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */
+        size_t mLength;
+        size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls);
+
+        size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls);
+        U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS];
+        int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0);
+
+        U32 matchIndex = hashTable[hash0];
+        U32 curr = (U32)(ip0 - base);
+        size_t step = stepSize;
+        const size_t kStepIncr = 1 << kSearchStrength;
+        const BYTE* nextStep = ip0 + kStepIncr;
+
+        /* Inner search loop */
+        while (1) {
+            const BYTE* match = base + matchIndex;
+            const U32 repIndex = curr + 1 - offset_1;
+            const BYTE* repMatch = (repIndex < prefixStartIndex) ?
+                                   dictBase + (repIndex - dictIndexDelta) :
+                                   base + repIndex;
+            const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls);
+            size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls);
+            hashTable[hash0] = curr;   /* update hash table */
+
+            if (((U32) ((prefixStartIndex - 1) - repIndex) >=
+                 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */
+                && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) {
+                const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+                mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4;
+                ip0++;
+                ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
+                break;
+            }
+
+            if (dictTagsMatch) {
+                /* Found a possible dict match */
+                const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
+                const BYTE* dictMatch = dictBase + dictMatchIndex;
+                if (dictMatchIndex > dictStartIndex &&
+                    MEM_read32(dictMatch) == MEM_read32(ip0)) {
+                    /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */
+                    if (matchIndex <= prefixStartIndex) {
+                        U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta);
+                        mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4;
+                        while (((ip0 > anchor) & (dictMatch > dictStart))
+                            && (ip0[-1] == dictMatch[-1])) {
+                            ip0--;
+                            dictMatch--;
+                            mLength++;
+                        } /* catch up */
+                        offset_2 = offset_1;
+                        offset_1 = offset;
+                        ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+                        break;
+                    }
+                }
+            }
+
+            if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) {
+                /* found a regular match */
+                U32 const offset = (U32) (ip0 - match);
+                mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4;
+                while (((ip0 > anchor) & (match > prefixStart))
+                       && (ip0[-1] == match[-1])) {
+                    ip0--;
+                    match--;
+                    mLength++;
+                } /* catch up */
+                offset_2 = offset_1;
+                offset_1 = offset;
+                ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+                break;
+            }
+
+            /* Prepare for next iteration */
+            dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS];
+            dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1);
+            matchIndex = hashTable[hash1];
+
+            if (ip1 >= nextStep) {
+                step++;
+                nextStep += kStepIncr;
+            }
+            ip0 = ip1;
+            ip1 = ip1 + step;
+            if (ip1 > ilimit) goto _cleanup;
+
+            curr = (U32)(ip0 - base);
+            hash0 = hash1;
+        }   /* end inner search loop */
+
+        /* match found */
+        assert(mLength);
+        ip0 += mLength;
+        anchor = ip0;
+
+        if (ip0 <= ilimit) {
+            /* Fill Table */
+            assert(base+curr+2 > istart);  /* check base overflow */
+            hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2;  /* here because curr+2 could be > iend-8 */
+            hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
+
+            /* check immediate repcode */
+            while (ip0 <= ilimit) {
+                U32 const current2 = (U32)(ip0-base);
+                U32 const repIndex2 = current2 - offset_2;
+                const BYTE* repMatch2 = repIndex2 < prefixStartIndex ?
+                        dictBase - dictIndexDelta + repIndex2 :
+                        base + repIndex2;
+                if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */)
+                   && (MEM_read32(repMatch2) == MEM_read32(ip0))) {
+                    const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+                    size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+                    U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
+                    hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2;
+                    ip0 += repLength2;
+                    anchor = ip0;
+                    continue;
+                }
+                break;
+            }
+        }
+
+        /* Prepare for next iteration */
+        assert(ip0 == anchor);
+        ip1 = ip0 + stepSize;
+    }
+
+_cleanup:
+    /* save reps for next block */
+    rep[0] = offset_1;
+    rep[1] = offset_2;
+
+    /* Return the last literals size */
+    return (size_t)(iend - anchor);
+}
+
+
+ZSTD_GEN_FAST_FN(dictMatchState, 4, 0)
+ZSTD_GEN_FAST_FN(dictMatchState, 5, 0)
+ZSTD_GEN_FAST_FN(dictMatchState, 6, 0)
+ZSTD_GEN_FAST_FN(dictMatchState, 7, 0)
+
+size_t ZSTD_compressBlock_fast_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    U32 const mls = ms->cParams.minMatch;
+    assert(ms->dictMatchState != NULL);
+    switch(mls)
+    {
+    default: /* includes case 3 */
+    case 4 :
+        return ZSTD_compressBlock_fast_dictMatchState_4_0(ms, seqStore, rep, src, srcSize);
+    case 5 :
+        return ZSTD_compressBlock_fast_dictMatchState_5_0(ms, seqStore, rep, src, srcSize);
+    case 6 :
+        return ZSTD_compressBlock_fast_dictMatchState_6_0(ms, seqStore, rep, src, srcSize);
+    case 7 :
+        return ZSTD_compressBlock_fast_dictMatchState_7_0(ms, seqStore, rep, src, srcSize);
+    }
+}
+
+
+static size_t ZSTD_compressBlock_fast_extDict_generic(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashTable = ms->hashTable;
+    U32 const hlog = cParams->hashLog;
+    /* support stepSize of 0 */
+    size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* anchor = istart;
+    const U32   endIndex = (U32)((size_t)(istart - base) + srcSize);
+    const U32   lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
+    const U32   dictStartIndex = lowLimit;
+    const BYTE* const dictStart = dictBase + dictStartIndex;
+    const U32   dictLimit = ms->window.dictLimit;
+    const U32   prefixStartIndex = dictLimit < lowLimit ? lowLimit : dictLimit;
+    const BYTE* const prefixStart = base + prefixStartIndex;
+    const BYTE* const dictEnd = dictBase + prefixStartIndex;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    U32 offset_1=rep[0], offset_2=rep[1];
+    U32 offsetSaved1 = 0, offsetSaved2 = 0;
+
+    const BYTE* ip0 = istart;
+    const BYTE* ip1;
+    const BYTE* ip2;
+    const BYTE* ip3;
+    U32 current0;
+
+
+    size_t hash0; /* hash for ip0 */
+    size_t hash1; /* hash for ip1 */
+    U32 idx; /* match idx for ip0 */
+    const BYTE* idxBase; /* base pointer for idx */
+
+    U32 offcode;
+    const BYTE* match0;
+    size_t mLength;
+    const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */
+
+    size_t step;
+    const BYTE* nextStep;
+    const size_t kStepIncr = (1 << (kSearchStrength - 1));
+
+    (void)hasStep; /* not currently specialized on whether it's accelerated */
+
+    DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic (offset_1=%u)", offset_1);
+
+    /* switch to "regular" variant if extDict is invalidated due to maxDistance */
+    if (prefixStartIndex == dictStartIndex)
+        return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize);
+
+    {   U32 const curr = (U32)(ip0 - base);
+        U32 const maxRep = curr - dictStartIndex;
+        if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0;
+        if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0;
+    }
+
+    /* start each op */
+_start: /* Requires: ip0 */
+
+    step = stepSize;
+    nextStep = ip0 + kStepIncr;
+
+    /* calculate positions, ip0 - anchor == 0, so we skip step calc */
+    ip1 = ip0 + 1;
+    ip2 = ip0 + step;
+    ip3 = ip2 + 1;
+
+    if (ip3 >= ilimit) {
+        goto _cleanup;
+    }
+
+    hash0 = ZSTD_hashPtr(ip0, hlog, mls);
+    hash1 = ZSTD_hashPtr(ip1, hlog, mls);
+
+    idx = hashTable[hash0];
+    idxBase = idx < prefixStartIndex ? dictBase : base;
+
+    do {
+        {   /* load repcode match for ip[2] */
+            U32 const current2 = (U32)(ip2 - base);
+            U32 const repIndex = current2 - offset_1;
+            const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
+            U32 rval;
+            if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */
+                 & (offset_1 > 0) ) {
+                rval = MEM_read32(repBase + repIndex);
+            } else {
+                rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */
+            }
+
+            /* write back hash table entry */
+            current0 = (U32)(ip0 - base);
+            hashTable[hash0] = current0;
+
+            /* check repcode at ip[2] */
+            if (MEM_read32(ip2) == rval) {
+                ip0 = ip2;
+                match0 = repBase + repIndex;
+                matchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+                assert((match0 != prefixStart) & (match0 != dictStart));
+                mLength = ip0[-1] == match0[-1];
+                ip0 -= mLength;
+                match0 -= mLength;
+                offcode = REPCODE1_TO_OFFBASE;
+                mLength += 4;
+                goto _match;
+        }   }
+
+        {   /* load match for ip[0] */
+            U32 const mval = idx >= dictStartIndex ?
+                    MEM_read32(idxBase + idx) :
+                    MEM_read32(ip0) ^ 1; /* guaranteed not to match */
+
+            /* check match at ip[0] */
+            if (MEM_read32(ip0) == mval) {
+                /* found a match! */
+                goto _offset;
+        }   }
+
+        /* lookup ip[1] */
+        idx = hashTable[hash1];
+        idxBase = idx < prefixStartIndex ? dictBase : base;
+
+        /* hash ip[2] */
+        hash0 = hash1;
+        hash1 = ZSTD_hashPtr(ip2, hlog, mls);
+
+        /* advance to next positions */
+        ip0 = ip1;
+        ip1 = ip2;
+        ip2 = ip3;
+
+        /* write back hash table entry */
+        current0 = (U32)(ip0 - base);
+        hashTable[hash0] = current0;
+
+        {   /* load match for ip[0] */
+            U32 const mval = idx >= dictStartIndex ?
+                    MEM_read32(idxBase + idx) :
+                    MEM_read32(ip0) ^ 1; /* guaranteed not to match */
+
+            /* check match at ip[0] */
+            if (MEM_read32(ip0) == mval) {
+                /* found a match! */
+                goto _offset;
+        }   }
+
+        /* lookup ip[1] */
+        idx = hashTable[hash1];
+        idxBase = idx < prefixStartIndex ? dictBase : base;
+
+        /* hash ip[2] */
+        hash0 = hash1;
+        hash1 = ZSTD_hashPtr(ip2, hlog, mls);
+
+        /* advance to next positions */
+        ip0 = ip1;
+        ip1 = ip2;
+        ip2 = ip0 + step;
+        ip3 = ip1 + step;
+
+        /* calculate step */
+        if (ip2 >= nextStep) {
+            step++;
+            PREFETCH_L1(ip1 + 64);
+            PREFETCH_L1(ip1 + 128);
+            nextStep += kStepIncr;
+        }
+    } while (ip3 < ilimit);
+
+_cleanup:
+    /* Note that there are probably still a couple positions we could search.
+     * However, it seems to be a meaningful performance hit to try to search
+     * them. So let's not. */
+
+    /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
+     * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
+    offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
+
+    /* save reps for next block */
+    rep[0] = offset_1 ? offset_1 : offsetSaved1;
+    rep[1] = offset_2 ? offset_2 : offsetSaved2;
+
+    /* Return the last literals size */
+    return (size_t)(iend - anchor);
+
+_offset: /* Requires: ip0, idx, idxBase */
+
+    /* Compute the offset code. */
+    {   U32 const offset = current0 - idx;
+        const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart;
+        matchEnd = idx < prefixStartIndex ? dictEnd : iend;
+        match0 = idxBase + idx;
+        offset_2 = offset_1;
+        offset_1 = offset;
+        offcode = OFFSET_TO_OFFBASE(offset);
+        mLength = 4;
+
+        /* Count the backwards match length. */
+        while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) {
+            ip0--;
+            match0--;
+            mLength++;
+    }   }
+
+_match: /* Requires: ip0, match0, offcode, matchEnd */
+
+    /* Count the forward length. */
+    assert(matchEnd != 0);
+    mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart);
+
+    ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);
+
+    ip0 += mLength;
+    anchor = ip0;
+
+    /* write next hash table entry */
+    if (ip1 < ip0) {
+        hashTable[hash1] = (U32)(ip1 - base);
+    }
+
+    /* Fill table and check for immediate repcode. */
+    if (ip0 <= ilimit) {
+        /* Fill Table */
+        assert(base+current0+2 > istart);  /* check base overflow */
+        hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2;  /* here because current+2 could be > iend-8 */
+        hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
+
+        while (ip0 <= ilimit) {
+            U32 const repIndex2 = (U32)(ip0-base) - offset_2;
+            const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
+            if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0))  /* intentional underflow */
+                 && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) {
+                const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+                size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+                { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; }  /* swap offset_2 <=> offset_1 */
+                ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
+                hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
+                ip0 += repLength2;
+                anchor = ip0;
+                continue;
+            }
+            break;
+    }   }
+
+    goto _start;
+}
+
+ZSTD_GEN_FAST_FN(extDict, 4, 0)
+ZSTD_GEN_FAST_FN(extDict, 5, 0)
+ZSTD_GEN_FAST_FN(extDict, 6, 0)
+ZSTD_GEN_FAST_FN(extDict, 7, 0)
+
+size_t ZSTD_compressBlock_fast_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    U32 const mls = ms->cParams.minMatch;
+    assert(ms->dictMatchState == NULL);
+    switch(mls)
+    {
+    default: /* includes case 3 */
+    case 4 :
+        return ZSTD_compressBlock_fast_extDict_4_0(ms, seqStore, rep, src, srcSize);
+    case 5 :
+        return ZSTD_compressBlock_fast_extDict_5_0(ms, seqStore, rep, src, srcSize);
+    case 6 :
+        return ZSTD_compressBlock_fast_extDict_6_0(ms, seqStore, rep, src, srcSize);
+    case 7 :
+        return ZSTD_compressBlock_fast_extDict_7_0(ms, seqStore, rep, src, srcSize);
+    }
+}
--- a/third_party/zstd/lib/compress/zstd_fast.h
+++ b/third_party/zstd/lib/compress/zstd_fast.h
@ -0,0 +1,39 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_FAST_H
+#define ZSTD_FAST_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include "third_party/zstd/lib/common/mem.h"      /* U32 */
+#include "third_party/zstd/lib/compress/zstd_compress_internal.h"
+
+void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+                        void const* end, ZSTD_dictTableLoadMethod_e dtlm,
+                        ZSTD_tableFillPurpose_e tfp);
+size_t ZSTD_compressBlock_fast(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_fast_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_fast_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_FAST_H */
--- a/third_party/zstd/lib/compress/zstd_lazy.c
+++ b/third_party/zstd/lib/compress/zstd_lazy.c
--- a/third_party/zstd/lib/compress/zstd_lazy.h
+++ b/third_party/zstd/lib/compress/zstd_lazy.h
@ -0,0 +1,203 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_LAZY_H
+#define ZSTD_LAZY_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include "third_party/zstd/lib/compress/zstd_compress_internal.h"
+
+/**
+ * Dedicated Dictionary Search Structure bucket log. In the
+ * ZSTD_dedicatedDictSearch mode, the hashTable has
+ * 2 ** ZSTD_LAZY_DDSS_BUCKET_LOG entries in each bucket, rather than just
+ * one.
+ */
+#define ZSTD_LAZY_DDSS_BUCKET_LOG 2
+
+#define ZSTD_ROW_HASH_TAG_BITS 8        /* nb bits to use for the tag */
+
+#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)
+U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip);
+void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip);
+
+void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip);
+
+void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue);  /*! used in ZSTD_reduceIndex(). preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */
+#endif
+
+#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_greedy(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy_dictMatchState_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy_extDict_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+
+#define ZSTD_COMPRESSBLOCK_GREEDY ZSTD_compressBlock_greedy
+#define ZSTD_COMPRESSBLOCK_GREEDY_ROW ZSTD_compressBlock_greedy_row
+#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE ZSTD_compressBlock_greedy_dictMatchState
+#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW ZSTD_compressBlock_greedy_dictMatchState_row
+#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH ZSTD_compressBlock_greedy_dedicatedDictSearch
+#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_greedy_dedicatedDictSearch_row
+#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT ZSTD_compressBlock_greedy_extDict
+#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW ZSTD_compressBlock_greedy_extDict_row
+#else
+#define ZSTD_COMPRESSBLOCK_GREEDY NULL
+#define ZSTD_COMPRESSBLOCK_GREEDY_ROW NULL
+#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE NULL
+#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW NULL
+#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH NULL
+#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW NULL
+#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT NULL
+#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW NULL
+#endif
+
+#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_lazy(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_dictMatchState_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_extDict_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+
+#define ZSTD_COMPRESSBLOCK_LAZY ZSTD_compressBlock_lazy
+#define ZSTD_COMPRESSBLOCK_LAZY_ROW ZSTD_compressBlock_lazy_row
+#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE ZSTD_compressBlock_lazy_dictMatchState
+#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy_dictMatchState_row
+#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy_dedicatedDictSearch
+#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy_dedicatedDictSearch_row
+#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT ZSTD_compressBlock_lazy_extDict
+#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW ZSTD_compressBlock_lazy_extDict_row
+#else
+#define ZSTD_COMPRESSBLOCK_LAZY NULL
+#define ZSTD_COMPRESSBLOCK_LAZY_ROW NULL
+#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE NULL
+#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW NULL
+#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH NULL
+#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW NULL
+#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT NULL
+#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW NULL
+#endif
+
+#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_lazy2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_extDict_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+
+#define ZSTD_COMPRESSBLOCK_LAZY2 ZSTD_compressBlock_lazy2
+#define ZSTD_COMPRESSBLOCK_LAZY2_ROW ZSTD_compressBlock_lazy2_row
+#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE ZSTD_compressBlock_lazy2_dictMatchState
+#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy2_dictMatchState_row
+#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy2_dedicatedDictSearch
+#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy2_dedicatedDictSearch_row
+#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT ZSTD_compressBlock_lazy2_extDict
+#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ZSTD_compressBlock_lazy2_extDict_row
+#else
+#define ZSTD_COMPRESSBLOCK_LAZY2 NULL
+#define ZSTD_COMPRESSBLOCK_LAZY2_ROW NULL
+#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE NULL
+#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW NULL
+#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH NULL
+#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW NULL
+#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT NULL
+#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW NULL
+#endif
+
+#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_btlazy2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btlazy2_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btlazy2_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+
+#define ZSTD_COMPRESSBLOCK_BTLAZY2 ZSTD_compressBlock_btlazy2
+#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE ZSTD_compressBlock_btlazy2_dictMatchState
+#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT ZSTD_compressBlock_btlazy2_extDict
+#else
+#define ZSTD_COMPRESSBLOCK_BTLAZY2 NULL
+#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE NULL
+#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT NULL
+#endif
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_LAZY_H */
--- a/third_party/zstd/lib/compress/zstd_ldm.c
+++ b/third_party/zstd/lib/compress/zstd_ldm.c
@ -0,0 +1,729 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include "third_party/zstd/lib/compress/zstd_ldm.h"
+
+#include "third_party/zstd/lib/common/debug.h"
+#include "third_party/zstd/lib/common/xxhash.h"
+#include "third_party/zstd/lib/compress/zstd_fast.h"          /* ZSTD_fillHashTable() */
+#include "third_party/zstd/lib/compress/zstd_double_fast.h"   /* ZSTD_fillDoubleHashTable() */
+#include "third_party/zstd/lib/compress/zstd_ldm_geartab.h"
+
+#define LDM_BUCKET_SIZE_LOG 3
+#define LDM_MIN_MATCH_LENGTH 64
+#define LDM_HASH_RLOG 7
+
+typedef struct {
+    U64 rolling;
+    U64 stopMask;
+} ldmRollingHashState_t;
+
+/** ZSTD_ldm_gear_init():
+ *
+ * Initializes the rolling hash state such that it will honor the
+ * settings in params. */
+static void ZSTD_ldm_gear_init(ldmRollingHashState_t* state, ldmParams_t const* params)
+{
+    unsigned maxBitsInMask = MIN(params->minMatchLength, 64);
+    unsigned hashRateLog = params->hashRateLog;
+
+    state->rolling = ~(U32)0;
+
+    /* The choice of the splitting criterion is subject to two conditions:
+     *   1. it has to trigger on average every 2^(hashRateLog) bytes;
+     *   2. ideally, it has to depend on a window of minMatchLength bytes.
+     *
+     * In the gear hash algorithm, bit n depends on the last n bytes;
+     * so in order to obtain a good quality splitting criterion it is
+     * preferable to use bits with high weight.
+     *
+     * To match condition 1 we use a mask with hashRateLog bits set
+     * and, because of the previous remark, we make sure these bits
+     * have the highest possible weight while still respecting
+     * condition 2.
+     */
+    if (hashRateLog > 0 && hashRateLog <= maxBitsInMask) {
+        state->stopMask = (((U64)1 << hashRateLog) - 1) << (maxBitsInMask - hashRateLog);
+    } else {
+        /* In this degenerate case we simply honor the hash rate. */
+        state->stopMask = ((U64)1 << hashRateLog) - 1;
+    }
+}
+
+/** ZSTD_ldm_gear_reset()
+ * Feeds [data, data + minMatchLength) into the hash without registering any
+ * splits. This effectively resets the hash state. This is used when skipping
+ * over data, either at the beginning of a block, or skipping sections.
+ */
+static void ZSTD_ldm_gear_reset(ldmRollingHashState_t* state,
+                                BYTE const* data, size_t minMatchLength)
+{
+    U64 hash = state->rolling;
+    size_t n = 0;
+
+#define GEAR_ITER_ONCE() do {                                  \
+        hash = (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \
+        n += 1;                                                \
+    } while (0)
+    while (n + 3 < minMatchLength) {
+        GEAR_ITER_ONCE();
+        GEAR_ITER_ONCE();
+        GEAR_ITER_ONCE();
+        GEAR_ITER_ONCE();
+    }
+    while (n < minMatchLength) {
+        GEAR_ITER_ONCE();
+    }
+#undef GEAR_ITER_ONCE
+}
+
+/** ZSTD_ldm_gear_feed():
+ *
+ * Registers in the splits array all the split points found in the first
+ * size bytes following the data pointer. This function terminates when
+ * either all the data has been processed or LDM_BATCH_SIZE splits are
+ * present in the splits array.
+ *
+ * Precondition: The splits array must not be full.
+ * Returns: The number of bytes processed. */
+static size_t ZSTD_ldm_gear_feed(ldmRollingHashState_t* state,
+                                 BYTE const* data, size_t size,
+                                 size_t* splits, unsigned* numSplits)
+{
+    size_t n;
+    U64 hash, mask;
+
+    hash = state->rolling;
+    mask = state->stopMask;
+    n = 0;
+
+#define GEAR_ITER_ONCE() do { \
+        hash = (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \
+        n += 1; \
+        if (UNLIKELY((hash & mask) == 0)) { \
+            splits[*numSplits] = n; \
+            *numSplits += 1; \
+            if (*numSplits == LDM_BATCH_SIZE) \
+                goto done; \
+        } \
+    } while (0)
+
+    while (n + 3 < size) {
+        GEAR_ITER_ONCE();
+        GEAR_ITER_ONCE();
+        GEAR_ITER_ONCE();
+        GEAR_ITER_ONCE();
+    }
+    while (n < size) {
+        GEAR_ITER_ONCE();
+    }
+
+#undef GEAR_ITER_ONCE
+
+done:
+    state->rolling = hash;
+    return n;
+}
+
+void ZSTD_ldm_adjustParameters(ldmParams_t* params,
+                               ZSTD_compressionParameters const* cParams)
+{
+    params->windowLog = cParams->windowLog;
+    ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX);
+    DEBUGLOG(4, "ZSTD_ldm_adjustParameters");
+    if (!params->bucketSizeLog) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG;
+    if (!params->minMatchLength) params->minMatchLength = LDM_MIN_MATCH_LENGTH;
+    if (params->hashLog == 0) {
+        params->hashLog = MAX(ZSTD_HASHLOG_MIN, params->windowLog - LDM_HASH_RLOG);
+        assert(params->hashLog <= ZSTD_HASHLOG_MAX);
+    }
+    if (params->hashRateLog == 0) {
+        params->hashRateLog = params->windowLog < params->hashLog
+                                   ? 0
+                                   : params->windowLog - params->hashLog;
+    }
+    params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog);
+}
+
+size_t ZSTD_ldm_getTableSize(ldmParams_t params)
+{
+    size_t const ldmHSize = ((size_t)1) << params.hashLog;
+    size_t const ldmBucketSizeLog = MIN(params.bucketSizeLog, params.hashLog);
+    size_t const ldmBucketSize = ((size_t)1) << (params.hashLog - ldmBucketSizeLog);
+    size_t const totalSize = ZSTD_cwksp_alloc_size(ldmBucketSize)
+                           + ZSTD_cwksp_alloc_size(ldmHSize * sizeof(ldmEntry_t));
+    return params.enableLdm == ZSTD_ps_enable ? totalSize : 0;
+}
+
+size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize)
+{
+    return params.enableLdm == ZSTD_ps_enable ? (maxChunkSize / params.minMatchLength) : 0;
+}
+
+/** ZSTD_ldm_getBucket() :
+ *  Returns a pointer to the start of the bucket associated with hash. */
+static ldmEntry_t* ZSTD_ldm_getBucket(
+        ldmState_t* ldmState, size_t hash, ldmParams_t const ldmParams)
+{
+    return ldmState->hashTable + (hash << ldmParams.bucketSizeLog);
+}
+
+/** ZSTD_ldm_insertEntry() :
+ *  Insert the entry with corresponding hash into the hash table */
+static void ZSTD_ldm_insertEntry(ldmState_t* ldmState,
+                                 size_t const hash, const ldmEntry_t entry,
+                                 ldmParams_t const ldmParams)
+{
+    BYTE* const pOffset = ldmState->bucketOffsets + hash;
+    unsigned const offset = *pOffset;
+
+    *(ZSTD_ldm_getBucket(ldmState, hash, ldmParams) + offset) = entry;
+    *pOffset = (BYTE)((offset + 1) & ((1u << ldmParams.bucketSizeLog) - 1));
+
+}
+
+/** ZSTD_ldm_countBackwardsMatch() :
+ *  Returns the number of bytes that match backwards before pIn and pMatch.
+ *
+ *  We count only bytes where pMatch >= pBase and pIn >= pAnchor. */
+static size_t ZSTD_ldm_countBackwardsMatch(
+            const BYTE* pIn, const BYTE* pAnchor,
+            const BYTE* pMatch, const BYTE* pMatchBase)
+{
+    size_t matchLength = 0;
+    while (pIn > pAnchor && pMatch > pMatchBase && pIn[-1] == pMatch[-1]) {
+        pIn--;
+        pMatch--;
+        matchLength++;
+    }
+    return matchLength;
+}
+
+/** ZSTD_ldm_countBackwardsMatch_2segments() :
+ *  Returns the number of bytes that match backwards from pMatch,
+ *  even with the backwards match spanning 2 different segments.
+ *
+ *  On reaching `pMatchBase`, start counting from mEnd */
+static size_t ZSTD_ldm_countBackwardsMatch_2segments(
+                    const BYTE* pIn, const BYTE* pAnchor,
+                    const BYTE* pMatch, const BYTE* pMatchBase,
+                    const BYTE* pExtDictStart, const BYTE* pExtDictEnd)
+{
+    size_t matchLength = ZSTD_ldm_countBackwardsMatch(pIn, pAnchor, pMatch, pMatchBase);
+    if (pMatch - matchLength != pMatchBase || pMatchBase == pExtDictStart) {
+        /* If backwards match is entirely in the extDict or prefix, immediately return */
+        return matchLength;
+    }
+    DEBUGLOG(7, "ZSTD_ldm_countBackwardsMatch_2segments: found 2-parts backwards match (length in prefix==%zu)", matchLength);
+    matchLength += ZSTD_ldm_countBackwardsMatch(pIn - matchLength, pAnchor, pExtDictEnd, pExtDictStart);
+    DEBUGLOG(7, "final backwards match length = %zu", matchLength);
+    return matchLength;
+}
+
+/** ZSTD_ldm_fillFastTables() :
+ *
+ *  Fills the relevant tables for the ZSTD_fast and ZSTD_dfast strategies.
+ *  This is similar to ZSTD_loadDictionaryContent.
+ *
+ *  The tables for the other strategies are filled within their
+ *  block compressors. */
+static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms,
+                                      void const* end)
+{
+    const BYTE* const iend = (const BYTE*)end;
+
+    switch(ms->cParams.strategy)
+    {
+    case ZSTD_fast:
+        ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx);
+        break;
+
+    case ZSTD_dfast:
+#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR
+        ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx);
+#else
+        assert(0); /* shouldn't be called: cparams should've been adjusted. */
+#endif
+        break;
+
+    case ZSTD_greedy:
+    case ZSTD_lazy:
+    case ZSTD_lazy2:
+    case ZSTD_btlazy2:
+    case ZSTD_btopt:
+    case ZSTD_btultra:
+    case ZSTD_btultra2:
+        break;
+    default:
+        assert(0);  /* not possible : not a valid strategy id */
+    }
+
+    return 0;
+}
+
+void ZSTD_ldm_fillHashTable(
+            ldmState_t* ldmState, const BYTE* ip,
+            const BYTE* iend, ldmParams_t const* params)
+{
+    U32 const minMatchLength = params->minMatchLength;
+    U32 const hBits = params->hashLog - params->bucketSizeLog;
+    BYTE const* const base = ldmState->window.base;
+    BYTE const* const istart = ip;
+    ldmRollingHashState_t hashState;
+    size_t* const splits = ldmState->splitIndices;
+    unsigned numSplits;
+
+    DEBUGLOG(5, "ZSTD_ldm_fillHashTable");
+
+    ZSTD_ldm_gear_init(&hashState, params);
+    while (ip < iend) {
+        size_t hashed;
+        unsigned n;
+
+        numSplits = 0;
+        hashed = ZSTD_ldm_gear_feed(&hashState, ip, iend - ip, splits, &numSplits);
+
+        for (n = 0; n < numSplits; n++) {
+            if (ip + splits[n] >= istart + minMatchLength) {
+                BYTE const* const split = ip + splits[n] - minMatchLength;
+                U64 const xxhash = XXH64(split, minMatchLength, 0);
+                U32 const hash = (U32)(xxhash & (((U32)1 << hBits) - 1));
+                ldmEntry_t entry;
+
+                entry.offset = (U32)(split - base);
+                entry.checksum = (U32)(xxhash >> 32);
+                ZSTD_ldm_insertEntry(ldmState, hash, entry, *params);
+            }
+        }
+
+        ip += hashed;
+    }
+}
+
+
+/** ZSTD_ldm_limitTableUpdate() :
+ *
+ *  Sets cctx->nextToUpdate to a position corresponding closer to anchor
+ *  if it is far way
+ *  (after a long match, only update tables a limited amount). */
+static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor)
+{
+    U32 const curr = (U32)(anchor - ms->window.base);
+    if (curr > ms->nextToUpdate + 1024) {
+        ms->nextToUpdate =
+            curr - MIN(512, curr - ms->nextToUpdate - 1024);
+    }
+}
+
+static size_t ZSTD_ldm_generateSequences_internal(
+        ldmState_t* ldmState, rawSeqStore_t* rawSeqStore,
+        ldmParams_t const* params, void const* src, size_t srcSize)
+{
+    /* LDM parameters */
+    int const extDict = ZSTD_window_hasExtDict(ldmState->window);
+    U32 const minMatchLength = params->minMatchLength;
+    U32 const entsPerBucket = 1U << params->bucketSizeLog;
+    U32 const hBits = params->hashLog - params->bucketSizeLog;
+    /* Prefix and extDict parameters */
+    U32 const dictLimit = ldmState->window.dictLimit;
+    U32 const lowestIndex = extDict ? ldmState->window.lowLimit : dictLimit;
+    BYTE const* const base = ldmState->window.base;
+    BYTE const* const dictBase = extDict ? ldmState->window.dictBase : NULL;
+    BYTE const* const dictStart = extDict ? dictBase + lowestIndex : NULL;
+    BYTE const* const dictEnd = extDict ? dictBase + dictLimit : NULL;
+    BYTE const* const lowPrefixPtr = base + dictLimit;
+    /* Input bounds */
+    BYTE const* const istart = (BYTE const*)src;
+    BYTE const* const iend = istart + srcSize;
+    BYTE const* const ilimit = iend - HASH_READ_SIZE;
+    /* Input positions */
+    BYTE const* anchor = istart;
+    BYTE const* ip = istart;
+    /* Rolling hash state */
+    ldmRollingHashState_t hashState;
+    /* Arrays for staged-processing */
+    size_t* const splits = ldmState->splitIndices;
+    ldmMatchCandidate_t* const candidates = ldmState->matchCandidates;
+    unsigned numSplits;
+
+    if (srcSize < minMatchLength)
+        return iend - anchor;
+
+    /* Initialize the rolling hash state with the first minMatchLength bytes */
+    ZSTD_ldm_gear_init(&hashState, params);
+    ZSTD_ldm_gear_reset(&hashState, ip, minMatchLength);
+    ip += minMatchLength;
+
+    while (ip < ilimit) {
+        size_t hashed;
+        unsigned n;
+
+        numSplits = 0;
+        hashed = ZSTD_ldm_gear_feed(&hashState, ip, ilimit - ip,
+                                    splits, &numSplits);
+
+        for (n = 0; n < numSplits; n++) {
+            BYTE const* const split = ip + splits[n] - minMatchLength;
+            U64 const xxhash = XXH64(split, minMatchLength, 0);
+            U32 const hash = (U32)(xxhash & (((U32)1 << hBits) - 1));
+
+            candidates[n].split = split;
+            candidates[n].hash = hash;
+            candidates[n].checksum = (U32)(xxhash >> 32);
+            candidates[n].bucket = ZSTD_ldm_getBucket(ldmState, hash, *params);
+            PREFETCH_L1(candidates[n].bucket);
+        }
+
+        for (n = 0; n < numSplits; n++) {
+            size_t forwardMatchLength = 0, backwardMatchLength = 0,
+                   bestMatchLength = 0, mLength;
+            U32 offset;
+            BYTE const* const split = candidates[n].split;
+            U32 const checksum = candidates[n].checksum;
+            U32 const hash = candidates[n].hash;
+            ldmEntry_t* const bucket = candidates[n].bucket;
+            ldmEntry_t const* cur;
+            ldmEntry_t const* bestEntry = NULL;
+            ldmEntry_t newEntry;
+
+            newEntry.offset = (U32)(split - base);
+            newEntry.checksum = checksum;
+
+            /* If a split point would generate a sequence overlapping with
+             * the previous one, we merely register it in the hash table and
+             * move on */
+            if (split < anchor) {
+                ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params);
+                continue;
+            }
+
+            for (cur = bucket; cur < bucket + entsPerBucket; cur++) {
+                size_t curForwardMatchLength, curBackwardMatchLength,
+                       curTotalMatchLength;
+                if (cur->checksum != checksum || cur->offset <= lowestIndex) {
+                    continue;
+                }
+                if (extDict) {
+                    BYTE const* const curMatchBase =
+                        cur->offset < dictLimit ? dictBase : base;
+                    BYTE const* const pMatch = curMatchBase + cur->offset;
+                    BYTE const* const matchEnd =
+                        cur->offset < dictLimit ? dictEnd : iend;
+                    BYTE const* const lowMatchPtr =
+                        cur->offset < dictLimit ? dictStart : lowPrefixPtr;
+                    curForwardMatchLength =
+                        ZSTD_count_2segments(split, pMatch, iend, matchEnd, lowPrefixPtr);
+                    if (curForwardMatchLength < minMatchLength) {
+                        continue;
+                    }
+                    curBackwardMatchLength = ZSTD_ldm_countBackwardsMatch_2segments(
+                            split, anchor, pMatch, lowMatchPtr, dictStart, dictEnd);
+                } else { /* !extDict */
+                    BYTE const* const pMatch = base + cur->offset;
+                    curForwardMatchLength = ZSTD_count(split, pMatch, iend);
+                    if (curForwardMatchLength < minMatchLength) {
+                        continue;
+                    }
+                    curBackwardMatchLength =
+                        ZSTD_ldm_countBackwardsMatch(split, anchor, pMatch, lowPrefixPtr);
+                }
+                curTotalMatchLength = curForwardMatchLength + curBackwardMatchLength;
+
+                if (curTotalMatchLength > bestMatchLength) {
+                    bestMatchLength = curTotalMatchLength;
+                    forwardMatchLength = curForwardMatchLength;
+                    backwardMatchLength = curBackwardMatchLength;
+                    bestEntry = cur;
+                }
+            }
+
+            /* No match found -- insert an entry into the hash table
+             * and process the next candidate match */
+            if (bestEntry == NULL) {
+                ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params);
+                continue;
+            }
+
+            /* Match found */
+            offset = (U32)(split - base) - bestEntry->offset;
+            mLength = forwardMatchLength + backwardMatchLength;
+            {
+                rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size;
+
+                /* Out of sequence storage */
+                if (rawSeqStore->size == rawSeqStore->capacity)
+                    return ERROR(dstSize_tooSmall);
+                seq->litLength = (U32)(split - backwardMatchLength - anchor);
+                seq->matchLength = (U32)mLength;
+                seq->offset = offset;
+                rawSeqStore->size++;
+            }
+
+            /* Insert the current entry into the hash table --- it must be
+             * done after the previous block to avoid clobbering bestEntry */
+            ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params);
+
+            anchor = split + forwardMatchLength;
+
+            /* If we find a match that ends after the data that we've hashed
+             * then we have a repeating, overlapping, pattern. E.g. all zeros.
+             * If one repetition of the pattern matches our `stopMask` then all
+             * repetitions will. We don't need to insert them all into out table,
+             * only the first one. So skip over overlapping matches.
+             * This is a major speed boost (20x) for compressing a single byte
+             * repeated, when that byte ends up in the table.
+             */
+            if (anchor > ip + hashed) {
+                ZSTD_ldm_gear_reset(&hashState, anchor - minMatchLength, minMatchLength);
+                /* Continue the outer loop at anchor (ip + hashed == anchor). */
+                ip = anchor - hashed;
+                break;
+            }
+        }
+
+        ip += hashed;
+    }
+
+    return iend - anchor;
+}
+
+/*! ZSTD_ldm_reduceTable() :
+ *  reduce table indexes by `reducerValue` */
+static void ZSTD_ldm_reduceTable(ldmEntry_t* const table, U32 const size,
+                                 U32 const reducerValue)
+{
+    U32 u;
+    for (u = 0; u < size; u++) {
+        if (table[u].offset < reducerValue) table[u].offset = 0;
+        else table[u].offset -= reducerValue;
+    }
+}
+
+size_t ZSTD_ldm_generateSequences(
+        ldmState_t* ldmState, rawSeqStore_t* sequences,
+        ldmParams_t const* params, void const* src, size_t srcSize)
+{
+    U32 const maxDist = 1U << params->windowLog;
+    BYTE const* const istart = (BYTE const*)src;
+    BYTE const* const iend = istart + srcSize;
+    size_t const kMaxChunkSize = 1 << 20;
+    size_t const nbChunks = (srcSize / kMaxChunkSize) + ((srcSize % kMaxChunkSize) != 0);
+    size_t chunk;
+    size_t leftoverSize = 0;
+
+    assert(ZSTD_CHUNKSIZE_MAX >= kMaxChunkSize);
+    /* Check that ZSTD_window_update() has been called for this chunk prior
+     * to passing it to this function.
+     */
+    assert(ldmState->window.nextSrc >= (BYTE const*)src + srcSize);
+    /* The input could be very large (in zstdmt), so it must be broken up into
+     * chunks to enforce the maximum distance and handle overflow correction.
+     */
+    assert(sequences->pos <= sequences->size);
+    assert(sequences->size <= sequences->capacity);
+    for (chunk = 0; chunk < nbChunks && sequences->size < sequences->capacity; ++chunk) {
+        BYTE const* const chunkStart = istart + chunk * kMaxChunkSize;
+        size_t const remaining = (size_t)(iend - chunkStart);
+        BYTE const *const chunkEnd =
+            (remaining < kMaxChunkSize) ? iend : chunkStart + kMaxChunkSize;
+        size_t const chunkSize = chunkEnd - chunkStart;
+        size_t newLeftoverSize;
+        size_t const prevSize = sequences->size;
+
+        assert(chunkStart < iend);
+        /* 1. Perform overflow correction if necessary. */
+        if (ZSTD_window_needOverflowCorrection(ldmState->window, 0, maxDist, ldmState->loadedDictEnd, chunkStart, chunkEnd)) {
+            U32 const ldmHSize = 1U << params->hashLog;
+            U32 const correction = ZSTD_window_correctOverflow(
+                &ldmState->window, /* cycleLog */ 0, maxDist, chunkStart);
+            ZSTD_ldm_reduceTable(ldmState->hashTable, ldmHSize, correction);
+            /* invalidate dictionaries on overflow correction */
+            ldmState->loadedDictEnd = 0;
+        }
+        /* 2. We enforce the maximum offset allowed.
+         *
+         * kMaxChunkSize should be small enough that we don't lose too much of
+         * the window through early invalidation.
+         * TODO: * Test the chunk size.
+         *       * Try invalidation after the sequence generation and test the
+         *         offset against maxDist directly.
+         *
+         * NOTE: Because of dictionaries + sequence splitting we MUST make sure
+         * that any offset used is valid at the END of the sequence, since it may
+         * be split into two sequences. This condition holds when using
+         * ZSTD_window_enforceMaxDist(), but if we move to checking offsets
+         * against maxDist directly, we'll have to carefully handle that case.
+         */
+        ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, &ldmState->loadedDictEnd, NULL);
+        /* 3. Generate the sequences for the chunk, and get newLeftoverSize. */
+        newLeftoverSize = ZSTD_ldm_generateSequences_internal(
+            ldmState, sequences, params, chunkStart, chunkSize);
+        if (ZSTD_isError(newLeftoverSize))
+            return newLeftoverSize;
+        /* 4. We add the leftover literals from previous iterations to the first
+         *    newly generated sequence, or add the `newLeftoverSize` if none are
+         *    generated.
+         */
+        /* Prepend the leftover literals from the last call */
+        if (prevSize < sequences->size) {
+            sequences->seq[prevSize].litLength += (U32)leftoverSize;
+            leftoverSize = newLeftoverSize;
+        } else {
+            assert(newLeftoverSize == chunkSize);
+            leftoverSize += chunkSize;
+        }
+    }
+    return 0;
+}
+
+void
+ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch)
+{
+    while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) {
+        rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos;
+        if (srcSize <= seq->litLength) {
+            /* Skip past srcSize literals */
+            seq->litLength -= (U32)srcSize;
+            return;
+        }
+        srcSize -= seq->litLength;
+        seq->litLength = 0;
+        if (srcSize < seq->matchLength) {
+            /* Skip past the first srcSize of the match */
+            seq->matchLength -= (U32)srcSize;
+            if (seq->matchLength < minMatch) {
+                /* The match is too short, omit it */
+                if (rawSeqStore->pos + 1 < rawSeqStore->size) {
+                    seq[1].litLength += seq[0].matchLength;
+                }
+                rawSeqStore->pos++;
+            }
+            return;
+        }
+        srcSize -= seq->matchLength;
+        seq->matchLength = 0;
+        rawSeqStore->pos++;
+    }
+}
+
+/**
+ * If the sequence length is longer than remaining then the sequence is split
+ * between this block and the next.
+ *
+ * Returns the current sequence to handle, or if the rest of the block should
+ * be literals, it returns a sequence with offset == 0.
+ */
+static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore,
+                                 U32 const remaining, U32 const minMatch)
+{
+    rawSeq sequence = rawSeqStore->seq[rawSeqStore->pos];
+    assert(sequence.offset > 0);
+    /* Likely: No partial sequence */
+    if (remaining >= sequence.litLength + sequence.matchLength) {
+        rawSeqStore->pos++;
+        return sequence;
+    }
+    /* Cut the sequence short (offset == 0 ==> rest is literals). */
+    if (remaining <= sequence.litLength) {
+        sequence.offset = 0;
+    } else if (remaining < sequence.litLength + sequence.matchLength) {
+        sequence.matchLength = remaining - sequence.litLength;
+        if (sequence.matchLength < minMatch) {
+            sequence.offset = 0;
+        }
+    }
+    /* Skip past `remaining` bytes for the future sequences. */
+    ZSTD_ldm_skipSequences(rawSeqStore, remaining, minMatch);
+    return sequence;
+}
+
+void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) {
+    U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes);
+    while (currPos && rawSeqStore->pos < rawSeqStore->size) {
+        rawSeq currSeq = rawSeqStore->seq[rawSeqStore->pos];
+        if (currPos >= currSeq.litLength + currSeq.matchLength) {
+            currPos -= currSeq.litLength + currSeq.matchLength;
+            rawSeqStore->pos++;
+        } else {
+            rawSeqStore->posInSequence = currPos;
+            break;
+        }
+    }
+    if (currPos == 0 || rawSeqStore->pos == rawSeqStore->size) {
+        rawSeqStore->posInSequence = 0;
+    }
+}
+
+size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
+    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+    ZSTD_paramSwitch_e useRowMatchFinder,
+    void const* src, size_t srcSize)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    unsigned const minMatch = cParams->minMatch;
+    ZSTD_blockCompressor const blockCompressor =
+        ZSTD_selectBlockCompressor(cParams->strategy, useRowMatchFinder, ZSTD_matchState_dictMode(ms));
+    /* Input bounds */
+    BYTE const* const istart = (BYTE const*)src;
+    BYTE const* const iend = istart + srcSize;
+    /* Input positions */
+    BYTE const* ip = istart;
+
+    DEBUGLOG(5, "ZSTD_ldm_blockCompress: srcSize=%zu", srcSize);
+    /* If using opt parser, use LDMs only as candidates rather than always accepting them */
+    if (cParams->strategy >= ZSTD_btopt) {
+        size_t lastLLSize;
+        ms->ldmSeqStore = rawSeqStore;
+        lastLLSize = blockCompressor(ms, seqStore, rep, src, srcSize);
+        ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore, srcSize);
+        return lastLLSize;
+    }
+
+    assert(rawSeqStore->pos <= rawSeqStore->size);
+    assert(rawSeqStore->size <= rawSeqStore->capacity);
+    /* Loop through each sequence and apply the block compressor to the literals */
+    while (rawSeqStore->pos < rawSeqStore->size && ip < iend) {
+        /* maybeSplitSequence updates rawSeqStore->pos */
+        rawSeq const sequence = maybeSplitSequence(rawSeqStore,
+                                                   (U32)(iend - ip), minMatch);
+        int i;
+        /* End signal */
+        if (sequence.offset == 0)
+            break;
+
+        assert(ip + sequence.litLength + sequence.matchLength <= iend);
+
+        /* Fill tables for block compressor */
+        ZSTD_ldm_limitTableUpdate(ms, ip);
+        ZSTD_ldm_fillFastTables(ms, ip);
+        /* Run the block compressor */
+        DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength);
+        {
+            size_t const newLitLength =
+                blockCompressor(ms, seqStore, rep, ip, sequence.litLength);
+            ip += sequence.litLength;
+            /* Update the repcodes */
+            for (i = ZSTD_REP_NUM - 1; i > 0; i--)
+                rep[i] = rep[i-1];
+            rep[0] = sequence.offset;
+            /* Store the sequence */
+            ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend,
+                          OFFSET_TO_OFFBASE(sequence.offset),
+                          sequence.matchLength);
+            ip += sequence.matchLength;
+        }
+    }
+    /* Fill the tables for the block compressor */
+    ZSTD_ldm_limitTableUpdate(ms, ip);
+    ZSTD_ldm_fillFastTables(ms, ip);
+    /* Compress the last literals */
+    return blockCompressor(ms, seqStore, rep, ip, iend - ip);
+}
--- a/third_party/zstd/lib/compress/zstd_ldm.h
+++ b/third_party/zstd/lib/compress/zstd_ldm.h
@ -0,0 +1,118 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_LDM_H
+#define ZSTD_LDM_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include "third_party/zstd/lib/compress/zstd_compress_internal.h"   /* ldmParams_t, U32 */
+#include "third_party/zstd/zstd.h"   /* ZSTD_CCtx, size_t */
+
+/*-*************************************
+*  Long distance matching
+***************************************/
+
+#define ZSTD_LDM_DEFAULT_WINDOW_LOG ZSTD_WINDOWLOG_LIMIT_DEFAULT
+
+void ZSTD_ldm_fillHashTable(
+            ldmState_t* state, const BYTE* ip,
+            const BYTE* iend, ldmParams_t const* params);
+
+/**
+ * ZSTD_ldm_generateSequences():
+ *
+ * Generates the sequences using the long distance match finder.
+ * Generates long range matching sequences in `sequences`, which parse a prefix
+ * of the source. `sequences` must be large enough to store every sequence,
+ * which can be checked with `ZSTD_ldm_getMaxNbSeq()`.
+ * @returns 0 or an error code.
+ *
+ * NOTE: The user must have called ZSTD_window_update() for all of the input
+ * they have, even if they pass it to ZSTD_ldm_generateSequences() in chunks.
+ * NOTE: This function returns an error if it runs out of space to store
+ *       sequences.
+ */
+size_t ZSTD_ldm_generateSequences(
+            ldmState_t* ldms, rawSeqStore_t* sequences,
+            ldmParams_t const* params, void const* src, size_t srcSize);
+
+/**
+ * ZSTD_ldm_blockCompress():
+ *
+ * Compresses a block using the predefined sequences, along with a secondary
+ * block compressor. The literals section of every sequence is passed to the
+ * secondary block compressor, and those sequences are interspersed with the
+ * predefined sequences. Returns the length of the last literals.
+ * Updates `rawSeqStore.pos` to indicate how many sequences have been consumed.
+ * `rawSeqStore.seq` may also be updated to split the last sequence between two
+ * blocks.
+ * @return The length of the last literals.
+ *
+ * NOTE: The source must be at most the maximum block size, but the predefined
+ * sequences can be any size, and may be longer than the block. In the case that
+ * they are longer than the block, the last sequences may need to be split into
+ * two. We handle that case correctly, and update `rawSeqStore` appropriately.
+ * NOTE: This function does not return any errors.
+ */
+size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
+            ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+            ZSTD_paramSwitch_e useRowMatchFinder,
+            void const* src, size_t srcSize);
+
+/**
+ * ZSTD_ldm_skipSequences():
+ *
+ * Skip past `srcSize` bytes worth of sequences in `rawSeqStore`.
+ * Avoids emitting matches less than `minMatch` bytes.
+ * Must be called for data that is not passed to ZSTD_ldm_blockCompress().
+ */
+void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize,
+    U32 const minMatch);
+
+/* ZSTD_ldm_skipRawSeqStoreBytes():
+ * Moves forward in rawSeqStore by nbBytes, updating fields 'pos' and 'posInSequence'.
+ * Not to be used in conjunction with ZSTD_ldm_skipSequences().
+ * Must be called for data with is not passed to ZSTD_ldm_blockCompress().
+ */
+void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes);
+
+/** ZSTD_ldm_getTableSize() :
+ *  Estimate the space needed for long distance matching tables or 0 if LDM is
+ *  disabled.
+ */
+size_t ZSTD_ldm_getTableSize(ldmParams_t params);
+
+/** ZSTD_ldm_getSeqSpace() :
+ *  Return an upper bound on the number of sequences that can be produced by
+ *  the long distance matcher, or 0 if LDM is disabled.
+ */
+size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize);
+
+/** ZSTD_ldm_adjustParameters() :
+ *  If the params->hashRateLog is not set, set it to its default value based on
+ *  windowLog and params->hashLog.
+ *
+ *  Ensures that params->bucketSizeLog is <= params->hashLog (setting it to
+ *  params->hashLog if it is not).
+ *
+ *  Ensures that the minMatchLength >= targetLength during optimal parsing.
+ */
+void ZSTD_ldm_adjustParameters(ldmParams_t* params,
+                               ZSTD_compressionParameters const* cParams);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_FAST_H */
--- a/third_party/zstd/lib/compress/zstd_ldm_geartab.h
+++ b/third_party/zstd/lib/compress/zstd_ldm_geartab.h
@ -0,0 +1,107 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_LDM_GEARTAB_H
+#define ZSTD_LDM_GEARTAB_H
+
+#include "third_party/zstd/lib/common/compiler.h" /* UNUSED_ATTR */
+#include "third_party/zstd/lib/common/mem.h"      /* U64 */
+
+static UNUSED_ATTR const U64 ZSTD_ldm_gearTab[256] = {
+    0xf5b8f72c5f77775c, 0x84935f266b7ac412, 0xb647ada9ca730ccc,
+    0xb065bb4b114fb1de, 0x34584e7e8c3a9fd0, 0x4e97e17c6ae26b05,
+    0x3a03d743bc99a604, 0xcecd042422c4044f, 0x76de76c58524259e,
+    0x9c8528f65badeaca, 0x86563706e2097529, 0x2902475fa375d889,
+    0xafb32a9739a5ebe6, 0xce2714da3883e639, 0x21eaf821722e69e,
+    0x37b628620b628,    0x49a8d455d88caf5,  0x8556d711e6958140,
+    0x4f7ae74fc605c1f,  0x829f0c3468bd3a20, 0x4ffdc885c625179e,
+    0x8473de048a3daf1b, 0x51008822b05646b2, 0x69d75d12b2d1cc5f,
+    0x8c9d4a19159154bc, 0xc3cc10f4abbd4003, 0xd06ddc1cecb97391,
+    0xbe48e6e7ed80302e, 0x3481db31cee03547, 0xacc3f67cdaa1d210,
+    0x65cb771d8c7f96cc, 0x8eb27177055723dd, 0xc789950d44cd94be,
+    0x934feadc3700b12b, 0x5e485f11edbdf182, 0x1e2e2a46fd64767a,
+    0x2969ca71d82efa7c, 0x9d46e9935ebbba2e, 0xe056b67e05e6822b,
+    0x94d73f55739d03a0, 0xcd7010bdb69b5a03, 0x455ef9fcd79b82f4,
+    0x869cb54a8749c161, 0x38d1a4fa6185d225, 0xb475166f94bbe9bb,
+    0xa4143548720959f1, 0x7aed4780ba6b26ba, 0xd0ce264439e02312,
+    0x84366d746078d508, 0xa8ce973c72ed17be, 0x21c323a29a430b01,
+    0x9962d617e3af80ee, 0xab0ce91d9c8cf75b, 0x530e8ee6d19a4dbc,
+    0x2ef68c0cf53f5d72, 0xc03a681640a85506, 0x496e4e9f9c310967,
+    0x78580472b59b14a0, 0x273824c23b388577, 0x66bf923ad45cb553,
+    0x47ae1a5a2492ba86, 0x35e304569e229659, 0x4765182a46870b6f,
+    0x6cbab625e9099412, 0xddac9a2e598522c1, 0x7172086e666624f2,
+    0xdf5003ca503b7837, 0x88c0c1db78563d09, 0x58d51865acfc289d,
+    0x177671aec65224f1, 0xfb79d8a241e967d7, 0x2be1e101cad9a49a,
+    0x6625682f6e29186b, 0x399553457ac06e50, 0x35dffb4c23abb74,
+    0x429db2591f54aade, 0xc52802a8037d1009, 0x6acb27381f0b25f3,
+    0xf45e2551ee4f823b, 0x8b0ea2d99580c2f7, 0x3bed519cbcb4e1e1,
+    0xff452823dbb010a,  0x9d42ed614f3dd267, 0x5b9313c06257c57b,
+    0xa114b8008b5e1442, 0xc1fe311c11c13d4b, 0x66e8763ea34c5568,
+    0x8b982af1c262f05d, 0xee8876faaa75fbb7, 0x8a62a4d0d172bb2a,
+    0xc13d94a3b7449a97, 0x6dbbba9dc15d037c, 0xc786101f1d92e0f1,
+    0xd78681a907a0b79b, 0xf61aaf2962c9abb9, 0x2cfd16fcd3cb7ad9,
+    0x868c5b6744624d21, 0x25e650899c74ddd7, 0xba042af4a7c37463,
+    0x4eb1a539465a3eca, 0xbe09dbf03b05d5ca, 0x774e5a362b5472ba,
+    0x47a1221229d183cd, 0x504b0ca18ef5a2df, 0xdffbdfbde2456eb9,
+    0x46cd2b2fbee34634, 0xf2aef8fe819d98c3, 0x357f5276d4599d61,
+    0x24a5483879c453e3, 0x88026889192b4b9,  0x28da96671782dbec,
+    0x4ef37c40588e9aaa, 0x8837b90651bc9fb3, 0xc164f741d3f0e5d6,
+    0xbc135a0a704b70ba, 0x69cd868f7622ada,  0xbc37ba89e0b9c0ab,
+    0x47c14a01323552f6, 0x4f00794bacee98bb, 0x7107de7d637a69d5,
+    0x88af793bb6f2255e, 0xf3c6466b8799b598, 0xc288c616aa7f3b59,
+    0x81ca63cf42fca3fd, 0x88d85ace36a2674b, 0xd056bd3792389e7,
+    0xe55c396c4e9dd32d, 0xbefb504571e6c0a6, 0x96ab32115e91e8cc,
+    0xbf8acb18de8f38d1, 0x66dae58801672606, 0x833b6017872317fb,
+    0xb87c16f2d1c92864, 0xdb766a74e58b669c, 0x89659f85c61417be,
+    0xc8daad856011ea0c, 0x76a4b565b6fe7eae, 0xa469d085f6237312,
+    0xaaf0365683a3e96c, 0x4dbb746f8424f7b8, 0x638755af4e4acc1,
+    0x3d7807f5bde64486, 0x17be6d8f5bbb7639, 0x903f0cd44dc35dc,
+    0x67b672eafdf1196c, 0xa676ff93ed4c82f1, 0x521d1004c5053d9d,
+    0x37ba9ad09ccc9202, 0x84e54d297aacfb51, 0xa0b4b776a143445,
+    0x820d471e20b348e,  0x1874383cb83d46dc, 0x97edeec7a1efe11c,
+    0xb330e50b1bdc42aa, 0x1dd91955ce70e032, 0xa514cdb88f2939d5,
+    0x2791233fd90db9d3, 0x7b670a4cc50f7a9b, 0x77c07d2a05c6dfa5,
+    0xe3778b6646d0a6fa, 0xb39c8eda47b56749, 0x933ed448addbef28,
+    0xaf846af6ab7d0bf4, 0xe5af208eb666e49,  0x5e6622f73534cd6a,
+    0x297daeca42ef5b6e, 0x862daef3d35539a6, 0xe68722498f8e1ea9,
+    0x981c53093dc0d572, 0xfa09b0bfbf86fbf5, 0x30b1e96166219f15,
+    0x70e7d466bdc4fb83, 0x5a66736e35f2a8e9, 0xcddb59d2b7c1baef,
+    0xd6c7d247d26d8996, 0xea4e39eac8de1ba3, 0x539c8bb19fa3aff2,
+    0x9f90e4c5fd508d8,  0xa34e5956fbaf3385, 0x2e2f8e151d3ef375,
+    0x173691e9b83faec1, 0xb85a8d56bf016379, 0x8382381267408ae3,
+    0xb90f901bbdc0096d, 0x7c6ad32933bcec65, 0x76bb5e2f2c8ad595,
+    0x390f851a6cf46d28, 0xc3e6064da1c2da72, 0xc52a0c101cfa5389,
+    0xd78eaf84a3fbc530, 0x3781b9e2288b997e, 0x73c2f6dea83d05c4,
+    0x4228e364c5b5ed7,  0x9d7a3edf0da43911, 0x8edcfeda24686756,
+    0x5e7667a7b7a9b3a1, 0x4c4f389fa143791d, 0xb08bc1023da7cddc,
+    0x7ab4be3ae529b1cc, 0x754e6132dbe74ff9, 0x71635442a839df45,
+    0x2f6fb1643fbe52de, 0x961e0a42cf7a8177, 0xf3b45d83d89ef2ea,
+    0xee3de4cf4a6e3e9b, 0xcd6848542c3295e7, 0xe4cee1664c78662f,
+    0x9947548b474c68c4, 0x25d73777a5ed8b0b, 0xc915b1d636b7fc,
+    0x21c2ba75d9b0d2da, 0x5f6b5dcf608a64a1, 0xdcf333255ff9570c,
+    0x633b922418ced4ee, 0xc136dde0b004b34a, 0x58cc83b05d4b2f5a,
+    0x5eb424dda28e42d2, 0x62df47369739cd98, 0xb4e0b42485e4ce17,
+    0x16e1f0c1f9a8d1e7, 0x8ec3916707560ebf, 0x62ba6e2df2cc9db3,
+    0xcbf9f4ff77d83a16, 0x78d9d7d07d2bbcc4, 0xef554ce1e02c41f4,
+    0x8d7581127eccf94d, 0xa9b53336cb3c8a05, 0x38c42c0bf45c4f91,
+    0x640893cdf4488863, 0x80ec34bc575ea568, 0x39f324f5b48eaa40,
+    0xe9d9ed1f8eff527f, 0x9224fc058cc5a214, 0xbaba00b04cfe7741,
+    0x309a9f120fcf52af, 0xa558f3ec65626212, 0x424bec8b7adabe2f,
+    0x41622513a6aea433, 0xb88da2d5324ca798, 0xd287733b245528a4,
+    0x9a44697e6d68aec3, 0x7b1093be2f49bb28, 0x50bbec632e3d8aad,
+    0x6cd90723e1ea8283, 0x897b9e7431b02bf3, 0x219efdcb338a7047,
+    0x3b0311f0a27c0656, 0xdb17bf91c0db96e7, 0x8cd4fd6b4e85a5b2,
+    0xfab071054ba6409d, 0x40d6fe831fa9dfd9, 0xaf358debad7d791e,
+    0xeb8d0e25a65e3e58, 0xbbcbd3df14e08580, 0xcf751f27ecdab2b,
+    0x2b4da14f2613d8f4
+};
+
+#endif /* ZSTD_LDM_GEARTAB_H */
--- a/third_party/zstd/lib/compress/zstd_opt.c
+++ b/third_party/zstd/lib/compress/zstd_opt.c
--- a/third_party/zstd/lib/compress/zstd_opt.h
+++ b/third_party/zstd/lib/compress/zstd_opt.h
@ -0,0 +1,81 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_OPT_H
+#define ZSTD_OPT_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include "third_party/zstd/lib/compress/zstd_compress_internal.h"
+
+#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR)
+/* used in ZSTD_loadDictionaryContent() */
+void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend);
+#endif
+
+#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_btopt(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btopt_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btopt_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+
+#define ZSTD_COMPRESSBLOCK_BTOPT ZSTD_compressBlock_btopt
+#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE ZSTD_compressBlock_btopt_dictMatchState
+#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT ZSTD_compressBlock_btopt_extDict
+#else
+#define ZSTD_COMPRESSBLOCK_BTOPT NULL
+#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE NULL
+#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT NULL
+#endif
+
+#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_btultra(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btultra_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btultra_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+
+        /* note : no btultra2 variant for extDict nor dictMatchState,
+         * because btultra2 is not meant to work with dictionaries
+         * and is only specific for the first block (no prefix) */
+size_t ZSTD_compressBlock_btultra2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+
+#define ZSTD_COMPRESSBLOCK_BTULTRA ZSTD_compressBlock_btultra
+#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ZSTD_compressBlock_btultra_dictMatchState
+#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ZSTD_compressBlock_btultra_extDict
+#define ZSTD_COMPRESSBLOCK_BTULTRA2 ZSTD_compressBlock_btultra2
+#else
+#define ZSTD_COMPRESSBLOCK_BTULTRA NULL
+#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE NULL
+#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT NULL
+#define ZSTD_COMPRESSBLOCK_BTULTRA2 NULL
+#endif
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_OPT_H */
--- a/third_party/zstd/lib/compress/zstdmt_compress.c
+++ b/third_party/zstd/lib/compress/zstdmt_compress.c
--- a/third_party/zstd/lib/compress/zstdmt_compress.h
+++ b/third_party/zstd/lib/compress/zstdmt_compress.h
@ -0,0 +1,114 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ #ifndef ZSTDMT_COMPRESS_H
+ #define ZSTDMT_COMPRESS_H
+
+ #if defined (__cplusplus)
+ extern "C" {
+ #endif
+
+
+/* Note : This is an internal API.
+ *        These APIs used to be exposed with ZSTDLIB_API,
+ *        because it used to be the only way to invoke MT compression.
+ *        Now, you must use ZSTD_compress2 and ZSTD_compressStream2() instead.
+ *
+ *        This API requires ZSTD_MULTITHREAD to be defined during compilation,
+ *        otherwise ZSTDMT_createCCtx*() will fail.
+ */
+
+/* ===   Dependencies   === */
+#include "third_party/zstd/lib/common/zstd_deps.h"   /* size_t */
+#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_parameters */
+#include "third_party/zstd/zstd.h"            /* ZSTD_inBuffer, ZSTD_outBuffer, ZSTDLIB_API */
+
+
+/* ===   Constants   === */
+#ifndef ZSTDMT_NBWORKERS_MAX /* a different value can be selected at compile time */
+#  define ZSTDMT_NBWORKERS_MAX ((sizeof(void*)==4) /*32-bit*/ ? 64 : 256)
+#endif
+#ifndef ZSTDMT_JOBSIZE_MIN   /* a different value can be selected at compile time */
+#  define ZSTDMT_JOBSIZE_MIN (512 KB)
+#endif
+#define ZSTDMT_JOBLOG_MAX   (MEM_32bits() ? 29 : 30)
+#define ZSTDMT_JOBSIZE_MAX  (MEM_32bits() ? (512 MB) : (1024 MB))
+
+
+/* ========================================================
+ * ===  Private interface, for use by ZSTD_compress.c   ===
+ * ===  Not exposed in libzstd. Never invoke directly   ===
+ * ======================================================== */
+
+/* ===   Memory management   === */
+typedef struct ZSTDMT_CCtx_s ZSTDMT_CCtx;
+/* Requires ZSTD_MULTITHREAD to be defined during compilation, otherwise it will return NULL. */
+ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers,
+                                        ZSTD_customMem cMem,
+					ZSTD_threadPool *pool);
+size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx);
+
+size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx);
+
+/* ===   Streaming functions   === */
+
+size_t ZSTDMT_nextInputSizeHint(const ZSTDMT_CCtx* mtctx);
+
+/*! ZSTDMT_initCStream_internal() :
+ *  Private use only. Init streaming operation.
+ *  expects params to be valid.
+ *  must receive dict, or cdict, or none, but not both.
+ *  mtctx can be freshly constructed or reused from a prior compression.
+ *  If mtctx is reused, memory allocations from the prior compression may not be freed,
+ *  even if they are not needed for the current compression.
+ *  @return : 0, or an error code */
+size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* mtctx,
+                    const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType,
+                    const ZSTD_CDict* cdict,
+                    ZSTD_CCtx_params params, unsigned long long pledgedSrcSize);
+
+/*! ZSTDMT_compressStream_generic() :
+ *  Combines ZSTDMT_compressStream() with optional ZSTDMT_flushStream() or ZSTDMT_endStream()
+ *  depending on flush directive.
+ * @return : minimum amount of data still to be flushed
+ *           0 if fully flushed
+ *           or an error code
+ *  note : needs to be init using any ZSTD_initCStream*() variant */
+size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx,
+                                     ZSTD_outBuffer* output,
+                                     ZSTD_inBuffer* input,
+                                     ZSTD_EndDirective endOp);
+
+ /*! ZSTDMT_toFlushNow()
+  *  Tell how many bytes are ready to be flushed immediately.
+  *  Probe the oldest active job (not yet entirely flushed) and check its output buffer.
+  *  If return 0, it means there is no active job,
+  *  or, it means oldest job is still active, but everything produced has been flushed so far,
+  *  therefore flushing is limited by speed of oldest job. */
+size_t ZSTDMT_toFlushNow(ZSTDMT_CCtx* mtctx);
+
+/*! ZSTDMT_updateCParams_whileCompressing() :
+ *  Updates only a selected set of compression parameters, to remain compatible with current frame.
+ *  New parameters will be applied to next compression job. */
+void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams);
+
+/*! ZSTDMT_getFrameProgression():
+ *  tells how much data has been consumed (input) and produced (output) for current frame.
+ *  able to count progression inside worker threads.
+ */
+ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* ZSTDMT_COMPRESS_H */
--- a/third_party/zstd/lib/decompress/huf_decompress.c
+++ b/third_party/zstd/lib/decompress/huf_decompress.c
--- a/third_party/zstd/lib/decompress/huf_decompress_amd64.S
+++ b/third_party/zstd/lib/decompress/huf_decompress_amd64.S
@ -0,0 +1,577 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include "third_party/zstd/lib/common/portability_macros.h"
+
+/* Stack marking
+ * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
+ */
+#if defined(__ELF__) && defined(__GNUC__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#if ZSTD_ENABLE_ASM_X86_64_BMI2
+
+/* Calling convention:
+ *
+ * %rdi contains the first argument: HUF_DecompressAsmArgs*.
+ * %rbp isn't maintained (no frame pointer).
+ * %rsp contains the stack pointer that grows down.
+ *      No red-zone is assumed, only addresses >= %rsp are used.
+ * All register contents are preserved.
+ *
+ * TODO: Support Windows calling convention.
+ */
+
+ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
+ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
+ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
+ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
+.global HUF_decompress4X1_usingDTable_internal_fast_asm_loop
+.global HUF_decompress4X2_usingDTable_internal_fast_asm_loop
+.global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop
+.global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop
+.text
+
+/* Sets up register mappings for clarity.
+ * op[], bits[], dtable & ip[0] each get their own register.
+ * ip[1,2,3] & olimit alias var[].
+ * %rax is a scratch register.
+ */
+
+#define op0    rsi
+#define op1    rbx
+#define op2    rcx
+#define op3    rdi
+
+#define ip0    r8
+#define ip1    r9
+#define ip2    r10
+#define ip3    r11
+
+#define bits0  rbp
+#define bits1  rdx
+#define bits2  r12
+#define bits3  r13
+#define dtable r14
+#define olimit r15
+
+/* var[] aliases ip[1,2,3] & olimit
+ * ip[1,2,3] are saved every iteration.
+ * olimit is only used in compute_olimit.
+ */
+#define var0   r15
+#define var1   r9
+#define var2   r10
+#define var3   r11
+
+/* 32-bit var registers */
+#define vard0  r15d
+#define vard1  r9d
+#define vard2  r10d
+#define vard3  r11d
+
+/* Calls X(N) for each stream 0, 1, 2, 3. */
+#define FOR_EACH_STREAM(X) \
+    X(0);                  \
+    X(1);                  \
+    X(2);                  \
+    X(3)
+
+/* Calls X(N, idx) for each stream 0, 1, 2, 3. */
+#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
+    X(0, idx);                             \
+    X(1, idx);                             \
+    X(2, idx);                             \
+    X(3, idx)
+
+/* Define both _HUF_* & HUF_* symbols because MacOS
+ * C symbols are prefixed with '_' & Linux symbols aren't.
+ */
+_HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
+HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
+    ZSTD_CET_ENDBRANCH
+    /* Save all registers - even if they are callee saved for simplicity. */
+    push %rax
+    push %rbx
+    push %rcx
+    push %rdx
+    push %rbp
+    push %rsi
+    push %rdi
+    push %r8
+    push %r9
+    push %r10
+    push %r11
+    push %r12
+    push %r13
+    push %r14
+    push %r15
+
+    /* Read HUF_DecompressAsmArgs* args from %rax */
+    movq %rdi, %rax
+    movq  0(%rax), %ip0
+    movq  8(%rax), %ip1
+    movq 16(%rax), %ip2
+    movq 24(%rax), %ip3
+    movq 32(%rax), %op0
+    movq 40(%rax), %op1
+    movq 48(%rax), %op2
+    movq 56(%rax), %op3
+    movq 64(%rax), %bits0
+    movq 72(%rax), %bits1
+    movq 80(%rax), %bits2
+    movq 88(%rax), %bits3
+    movq 96(%rax), %dtable
+    push %rax      /* argument */
+    push 104(%rax) /* ilimit */
+    push 112(%rax) /* oend */
+    push %olimit   /* olimit space */
+
+    subq $24, %rsp
+
+.L_4X1_compute_olimit:
+    /* Computes how many iterations we can do safely
+     * %r15, %rax may be clobbered
+     * rbx, rdx must be saved
+     * op3 & ip0 mustn't be clobbered
+     */
+    movq %rbx, 0(%rsp)
+    movq %rdx, 8(%rsp)
+
+    movq 32(%rsp), %rax /* rax = oend */
+    subq %op3,    %rax  /* rax = oend - op3 */
+
+    /* r15 = (oend - op3) / 5 */
+    movabsq $-3689348814741910323, %rdx
+    mulq %rdx
+    movq %rdx, %r15
+    shrq $2, %r15
+
+    movq %ip0,     %rax /* rax = ip0 */
+    movq 40(%rsp), %rdx /* rdx = ilimit */
+    subq %rdx,     %rax /* rax = ip0 - ilimit */
+    movq %rax,     %rbx /* rbx = ip0 - ilimit */
+
+    /* rdx = (ip0 - ilimit) / 7 */
+    movabsq $2635249153387078803, %rdx
+    mulq %rdx
+    subq %rdx, %rbx
+    shrq %rbx
+    addq %rbx, %rdx
+    shrq $2, %rdx
+
+    /* r15 = min(%rdx, %r15) */
+    cmpq %rdx, %r15
+    cmova %rdx, %r15
+
+    /* r15 = r15 * 5 */
+    leaq (%r15, %r15, 4), %r15
+
+    /* olimit = op3 + r15 */
+    addq %op3, %olimit
+
+    movq 8(%rsp), %rdx
+    movq 0(%rsp), %rbx
+
+    /* If (op3 + 20 > olimit) */
+    movq %op3, %rax    /* rax = op3 */
+    addq $20,  %rax    /* rax = op3 + 20 */
+    cmpq %rax, %olimit /* op3 + 20 > olimit */
+    jb .L_4X1_exit
+
+    /* If (ip1 < ip0) go to exit */
+    cmpq %ip0, %ip1
+    jb .L_4X1_exit
+
+    /* If (ip2 < ip1) go to exit */
+    cmpq %ip1, %ip2
+    jb .L_4X1_exit
+
+    /* If (ip3 < ip2) go to exit */
+    cmpq %ip2, %ip3
+    jb .L_4X1_exit
+
+/* Reads top 11 bits from bits[n]
+ * Loads dt[bits[n]] into var[n]
+ */
+#define GET_NEXT_DELT(n)                \
+    movq $53, %var##n;                  \
+    shrxq %var##n, %bits##n, %var##n;   \
+    movzwl (%dtable,%var##n,2),%vard##n
+
+/* var[n] must contain the DTable entry computed with GET_NEXT_DELT
+ * Moves var[n] to %rax
+ * bits[n] <<= var[n] & 63
+ * op[n][idx] = %rax >> 8
+ * %ah is a way to access bits [8, 16) of %rax
+ */
+#define DECODE_FROM_DELT(n, idx)       \
+    movq %var##n, %rax;                \
+    shlxq %var##n, %bits##n, %bits##n; \
+    movb %ah, idx(%op##n)
+
+/* Assumes GET_NEXT_DELT has been called.
+ * Calls DECODE_FROM_DELT then GET_NEXT_DELT
+ */
+#define DECODE_AND_GET_NEXT(n, idx) \
+    DECODE_FROM_DELT(n, idx);       \
+    GET_NEXT_DELT(n)                \
+
+/* // ctz & nbBytes is stored in bits[n]
+ * // nbBits is stored in %rax
+ * ctz  = CTZ[bits[n]]
+ * nbBits  = ctz & 7
+ * nbBytes = ctz >> 3
+ * op[n]  += 5
+ * ip[n]  -= nbBytes
+ * // Note: x86-64 is little-endian ==> no bswap
+ * bits[n] = MEM_readST(ip[n]) | 1
+ * bits[n] <<= nbBits
+ */
+#define RELOAD_BITS(n)             \
+    bsfq %bits##n, %bits##n;       \
+    movq %bits##n, %rax;           \
+    andq $7, %rax;                 \
+    shrq $3, %bits##n;             \
+    leaq 5(%op##n), %op##n;        \
+    subq %bits##n, %ip##n;         \
+    movq (%ip##n), %bits##n;       \
+    orq $1, %bits##n;              \
+    shlx %rax, %bits##n, %bits##n
+
+    /* Store clobbered variables on the stack */
+    movq %olimit, 24(%rsp)
+    movq %ip1, 0(%rsp)
+    movq %ip2, 8(%rsp)
+    movq %ip3, 16(%rsp)
+
+    /* Call GET_NEXT_DELT for each stream */
+    FOR_EACH_STREAM(GET_NEXT_DELT)
+
+    .p2align 6
+
+.L_4X1_loop_body:
+    /* Decode 5 symbols in each of the 4 streams (20 total)
+     * Must have called GET_NEXT_DELT for each stream
+     */
+    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
+    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
+    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
+    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
+    FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
+
+    /* Load ip[1,2,3] from stack (var[] aliases them)
+     * ip[] is needed for RELOAD_BITS
+     * Each will be stored back to the stack after RELOAD
+     */
+    movq 0(%rsp), %ip1
+    movq 8(%rsp), %ip2
+    movq 16(%rsp), %ip3
+
+    /* Reload each stream & fetch the next table entry
+     * to prepare for the next iteration
+     */
+    RELOAD_BITS(0)
+    GET_NEXT_DELT(0)
+
+    RELOAD_BITS(1)
+    movq %ip1, 0(%rsp)
+    GET_NEXT_DELT(1)
+
+    RELOAD_BITS(2)
+    movq %ip2, 8(%rsp)
+    GET_NEXT_DELT(2)
+
+    RELOAD_BITS(3)
+    movq %ip3, 16(%rsp)
+    GET_NEXT_DELT(3)
+
+    /* If op3 < olimit: continue the loop */
+    cmp %op3, 24(%rsp)
+    ja .L_4X1_loop_body
+
+    /* Reload ip[1,2,3] from stack */
+    movq 0(%rsp), %ip1
+    movq 8(%rsp), %ip2
+    movq 16(%rsp), %ip3
+
+    /* Re-compute olimit */
+    jmp .L_4X1_compute_olimit
+
+#undef GET_NEXT_DELT
+#undef DECODE_FROM_DELT
+#undef DECODE
+#undef RELOAD_BITS
+.L_4X1_exit:
+    addq $24, %rsp
+
+    /* Restore stack (oend & olimit) */
+    pop %rax /* olimit */
+    pop %rax /* oend */
+    pop %rax /* ilimit */
+    pop %rax /* arg */
+
+    /* Save ip / op / bits */
+    movq %ip0,  0(%rax)
+    movq %ip1,  8(%rax)
+    movq %ip2, 16(%rax)
+    movq %ip3, 24(%rax)
+    movq %op0, 32(%rax)
+    movq %op1, 40(%rax)
+    movq %op2, 48(%rax)
+    movq %op3, 56(%rax)
+    movq %bits0, 64(%rax)
+    movq %bits1, 72(%rax)
+    movq %bits2, 80(%rax)
+    movq %bits3, 88(%rax)
+
+    /* Restore registers */
+    pop %r15
+    pop %r14
+    pop %r13
+    pop %r12
+    pop %r11
+    pop %r10
+    pop %r9
+    pop %r8
+    pop %rdi
+    pop %rsi
+    pop %rbp
+    pop %rdx
+    pop %rcx
+    pop %rbx
+    pop %rax
+    ret
+
+_HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
+HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
+    ZSTD_CET_ENDBRANCH
+    /* Save all registers - even if they are callee saved for simplicity. */
+    push %rax
+    push %rbx
+    push %rcx
+    push %rdx
+    push %rbp
+    push %rsi
+    push %rdi
+    push %r8
+    push %r9
+    push %r10
+    push %r11
+    push %r12
+    push %r13
+    push %r14
+    push %r15
+
+    movq %rdi, %rax
+    movq  0(%rax), %ip0
+    movq  8(%rax), %ip1
+    movq 16(%rax), %ip2
+    movq 24(%rax), %ip3
+    movq 32(%rax), %op0
+    movq 40(%rax), %op1
+    movq 48(%rax), %op2
+    movq 56(%rax), %op3
+    movq 64(%rax), %bits0
+    movq 72(%rax), %bits1
+    movq 80(%rax), %bits2
+    movq 88(%rax), %bits3
+    movq 96(%rax), %dtable
+    push %rax      /* argument */
+    push %rax      /* olimit */
+    push 104(%rax) /* ilimit */
+
+    movq 112(%rax), %rax
+    push %rax /* oend3 */
+
+    movq %op3, %rax
+    push %rax /* oend2 */
+
+    movq %op2, %rax
+    push %rax /* oend1 */
+
+    movq %op1, %rax
+    push %rax /* oend0 */
+
+    /* Scratch space */
+    subq $8, %rsp
+
+.L_4X2_compute_olimit:
+    /* Computes how many iterations we can do safely
+     * %r15, %rax may be clobbered
+     * rdx must be saved
+     * op[1,2,3,4] & ip0 mustn't be clobbered
+     */
+    movq %rdx, 0(%rsp)
+
+    /* We can consume up to 7 input bytes each iteration. */
+    movq %ip0,     %rax  /* rax = ip0 */
+    movq 40(%rsp), %rdx  /* rdx = ilimit */
+    subq %rdx,     %rax  /* rax = ip0 - ilimit */
+    movq %rax,    %r15   /* r15 = ip0 - ilimit */
+
+    /* rdx = rax / 7 */
+    movabsq $2635249153387078803, %rdx
+    mulq %rdx
+    subq %rdx, %r15
+    shrq %r15
+    addq %r15, %rdx
+    shrq $2, %rdx
+
+    /* r15 = (ip0 - ilimit) / 7 */
+    movq %rdx, %r15
+
+    /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */
+    movq 8(%rsp),  %rax /* rax = oend0 */
+    subq %op0,     %rax /* rax = oend0 - op0 */
+    movq 16(%rsp), %rdx /* rdx = oend1 */
+    subq %op1,     %rdx /* rdx = oend1 - op1 */
+
+    cmpq  %rax,    %rdx
+    cmova %rax,    %rdx /* rdx = min(%rdx, %rax) */
+
+    movq 24(%rsp), %rax /* rax = oend2 */
+    subq %op2,     %rax /* rax = oend2 - op2 */
+
+    cmpq  %rax,    %rdx
+    cmova %rax,    %rdx /* rdx = min(%rdx, %rax) */
+
+    movq 32(%rsp), %rax /* rax = oend3 */
+    subq %op3,     %rax /* rax = oend3 - op3 */
+
+    cmpq  %rax,    %rdx
+    cmova %rax,    %rdx /* rdx = min(%rdx, %rax) */
+
+    movabsq $-3689348814741910323, %rax
+    mulq %rdx
+    shrq $3,       %rdx /* rdx = rdx / 10 */
+
+    /* r15 = min(%rdx, %r15) */
+    cmpq  %rdx, %r15
+    cmova %rdx, %r15
+
+    /* olimit = op3 + 5 * r15 */
+    movq %r15, %rax
+    leaq (%op3, %rax, 4), %olimit
+    addq %rax, %olimit
+
+    movq 0(%rsp), %rdx
+
+    /* If (op3 + 10 > olimit) */
+    movq %op3, %rax    /* rax = op3 */
+    addq $10,  %rax    /* rax = op3 + 10 */
+    cmpq %rax, %olimit /* op3 + 10 > olimit */
+    jb .L_4X2_exit
+
+    /* If (ip1 < ip0) go to exit */
+    cmpq %ip0, %ip1
+    jb .L_4X2_exit
+
+    /* If (ip2 < ip1) go to exit */
+    cmpq %ip1, %ip2
+    jb .L_4X2_exit
+
+    /* If (ip3 < ip2) go to exit */
+    cmpq %ip2, %ip3
+    jb .L_4X2_exit
+
+#define DECODE(n, idx)              \
+    movq %bits##n, %rax;            \
+    shrq $53, %rax;                 \
+    movzwl 0(%dtable,%rax,4),%r8d;  \
+    movzbl 2(%dtable,%rax,4),%r15d; \
+    movzbl 3(%dtable,%rax,4),%eax;  \
+    movw %r8w, (%op##n);            \
+    shlxq %r15, %bits##n, %bits##n; \
+    addq %rax, %op##n
+
+#define RELOAD_BITS(n)              \
+    bsfq %bits##n, %bits##n;        \
+    movq %bits##n, %rax;            \
+    shrq $3, %bits##n;              \
+    andq $7, %rax;                  \
+    subq %bits##n, %ip##n;          \
+    movq (%ip##n), %bits##n;        \
+    orq $1, %bits##n;               \
+    shlxq %rax, %bits##n, %bits##n
+
+
+    movq %olimit, 48(%rsp)
+
+    .p2align 6
+
+.L_4X2_loop_body:
+    /* We clobber r8, so store it on the stack */
+    movq %r8, 0(%rsp)
+
+    /* Decode 5 symbols from each of the 4 streams (20 symbols total). */
+    FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
+    FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
+    FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
+    FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
+    FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
+
+    /* Reload r8 */
+    movq 0(%rsp), %r8
+
+    FOR_EACH_STREAM(RELOAD_BITS)
+
+    cmp %op3, 48(%rsp)
+    ja .L_4X2_loop_body
+    jmp .L_4X2_compute_olimit
+
+#undef DECODE
+#undef RELOAD_BITS
+.L_4X2_exit:
+    addq $8, %rsp
+    /* Restore stack (oend & olimit) */
+    pop %rax /* oend0 */
+    pop %rax /* oend1 */
+    pop %rax /* oend2 */
+    pop %rax /* oend3 */
+    pop %rax /* ilimit */
+    pop %rax /* olimit */
+    pop %rax /* arg */
+
+    /* Save ip / op / bits */
+    movq %ip0,  0(%rax)
+    movq %ip1,  8(%rax)
+    movq %ip2, 16(%rax)
+    movq %ip3, 24(%rax)
+    movq %op0, 32(%rax)
+    movq %op1, 40(%rax)
+    movq %op2, 48(%rax)
+    movq %op3, 56(%rax)
+    movq %bits0, 64(%rax)
+    movq %bits1, 72(%rax)
+    movq %bits2, 80(%rax)
+    movq %bits3, 88(%rax)
+
+    /* Restore registers */
+    pop %r15
+    pop %r14
+    pop %r13
+    pop %r12
+    pop %r11
+    pop %r10
+    pop %r9
+    pop %r8
+    pop %rdi
+    pop %rsi
+    pop %rbp
+    pop %rdx
+    pop %rcx
+    pop %rbx
+    pop %rax
+    ret
+
+#endif
--- a/third_party/zstd/lib/decompress/zstd_ddict.c
+++ b/third_party/zstd/lib/decompress/zstd_ddict.c
@ -0,0 +1,245 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* zstd_ddict.c :
+ * concentrates all logic that needs to know the internals of ZSTD_DDict object */
+
+/*-*******************************************************
+*  Dependencies
+*********************************************************/
+#include "third_party/zstd/lib/common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customFree */
+#include "third_party/zstd/lib/common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
+#include "third_party/zstd/lib/common/cpu.h"         /* bmi2 */
+#include "third_party/zstd/lib/common/mem.h"         /* low level memory routines */
+#define FSE_STATIC_LINKING_ONLY
+#include "third_party/zstd/lib/common/fse.h"
+#include "third_party/zstd/lib/common/huf.h"
+#include "third_party/zstd/lib/decompress/zstd_decompress_internal.h"
+#include "third_party/zstd/lib/decompress/zstd_ddict.h"
+
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+#include "third_party/zstd/lib/legacy/zstd_legacy.h"
+#endif
+
+
+
+/*-*******************************************************
+*  Types
+*********************************************************/
+struct ZSTD_DDict_s {
+    void* dictBuffer;
+    const void* dictContent;
+    size_t dictSize;
+    ZSTD_entropyDTables_t entropy;
+    U32 dictID;
+    U32 entropyPresent;
+    ZSTD_customMem cMem;
+};  /* typedef'd to ZSTD_DDict within "zstd.h" */
+
+const void* ZSTD_DDict_dictContent(const ZSTD_DDict* ddict)
+{
+    assert(ddict != NULL);
+    return ddict->dictContent;
+}
+
+size_t ZSTD_DDict_dictSize(const ZSTD_DDict* ddict)
+{
+    assert(ddict != NULL);
+    return ddict->dictSize;
+}
+
+void ZSTD_copyDDictParameters(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
+{
+    DEBUGLOG(4, "ZSTD_copyDDictParameters");
+    assert(dctx != NULL);
+    assert(ddict != NULL);
+    dctx->dictID = ddict->dictID;
+    dctx->prefixStart = ddict->dictContent;
+    dctx->virtualStart = ddict->dictContent;
+    dctx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize;
+    dctx->previousDstEnd = dctx->dictEnd;
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    dctx->dictContentBeginForFuzzing = dctx->prefixStart;
+    dctx->dictContentEndForFuzzing = dctx->previousDstEnd;
+#endif
+    if (ddict->entropyPresent) {
+        dctx->litEntropy = 1;
+        dctx->fseEntropy = 1;
+        dctx->LLTptr = ddict->entropy.LLTable;
+        dctx->MLTptr = ddict->entropy.MLTable;
+        dctx->OFTptr = ddict->entropy.OFTable;
+        dctx->HUFptr = ddict->entropy.hufTable;
+        dctx->entropy.rep[0] = ddict->entropy.rep[0];
+        dctx->entropy.rep[1] = ddict->entropy.rep[1];
+        dctx->entropy.rep[2] = ddict->entropy.rep[2];
+    } else {
+        dctx->litEntropy = 0;
+        dctx->fseEntropy = 0;
+    }
+}
+
+
+static size_t
+ZSTD_loadEntropy_intoDDict(ZSTD_DDict* ddict,
+                           ZSTD_dictContentType_e dictContentType)
+{
+    ddict->dictID = 0;
+    ddict->entropyPresent = 0;
+    if (dictContentType == ZSTD_dct_rawContent) return 0;
+
+    if (ddict->dictSize < 8) {
+        if (dictContentType == ZSTD_dct_fullDict)
+            return ERROR(dictionary_corrupted);   /* only accept specified dictionaries */
+        return 0;   /* pure content mode */
+    }
+    {   U32 const magic = MEM_readLE32(ddict->dictContent);
+        if (magic != ZSTD_MAGIC_DICTIONARY) {
+            if (dictContentType == ZSTD_dct_fullDict)
+                return ERROR(dictionary_corrupted);   /* only accept specified dictionaries */
+            return 0;   /* pure content mode */
+        }
+    }
+    ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_FRAMEIDSIZE);
+
+    /* load entropy tables */
+    RETURN_ERROR_IF(ZSTD_isError(ZSTD_loadDEntropy(
+            &ddict->entropy, ddict->dictContent, ddict->dictSize)),
+        dictionary_corrupted, "");
+    ddict->entropyPresent = 1;
+    return 0;
+}
+
+
+static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict,
+                                      const void* dict, size_t dictSize,
+                                      ZSTD_dictLoadMethod_e dictLoadMethod,
+                                      ZSTD_dictContentType_e dictContentType)
+{
+    if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dict) || (!dictSize)) {
+        ddict->dictBuffer = NULL;
+        ddict->dictContent = dict;
+        if (!dict) dictSize = 0;
+    } else {
+        void* const internalBuffer = ZSTD_customMalloc(dictSize, ddict->cMem);
+        ddict->dictBuffer = internalBuffer;
+        ddict->dictContent = internalBuffer;
+        if (!internalBuffer) return ERROR(memory_allocation);
+        ZSTD_memcpy(internalBuffer, dict, dictSize);
+    }
+    ddict->dictSize = dictSize;
+    ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001);  /* cover both little and big endian */
+
+    /* parse dictionary content */
+    FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , "");
+
+    return 0;
+}
+
+ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize,
+                                      ZSTD_dictLoadMethod_e dictLoadMethod,
+                                      ZSTD_dictContentType_e dictContentType,
+                                      ZSTD_customMem customMem)
+{
+    if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL;
+
+    {   ZSTD_DDict* const ddict = (ZSTD_DDict*) ZSTD_customMalloc(sizeof(ZSTD_DDict), customMem);
+        if (ddict == NULL) return NULL;
+        ddict->cMem = customMem;
+        {   size_t const initResult = ZSTD_initDDict_internal(ddict,
+                                            dict, dictSize,
+                                            dictLoadMethod, dictContentType);
+            if (ZSTD_isError(initResult)) {
+                ZSTD_freeDDict(ddict);
+                return NULL;
+        }   }
+        return ddict;
+    }
+}
+
+/*! ZSTD_createDDict() :
+*   Create a digested dictionary, to start decompression without startup delay.
+*   `dict` content is copied inside DDict.
+*   Consequently, `dict` can be released after `ZSTD_DDict` creation */
+ZSTD_DDict* ZSTD_createDDict(const void* dict, size_t dictSize)
+{
+    ZSTD_customMem const allocator = { NULL, NULL, NULL };
+    return ZSTD_createDDict_advanced(dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto, allocator);
+}
+
+/*! ZSTD_createDDict_byReference() :
+ *  Create a digested dictionary, to start decompression without startup delay.
+ *  Dictionary content is simply referenced, it will be accessed during decompression.
+ *  Warning : dictBuffer must outlive DDict (DDict must be freed before dictBuffer) */
+ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize)
+{
+    ZSTD_customMem const allocator = { NULL, NULL, NULL };
+    return ZSTD_createDDict_advanced(dictBuffer, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto, allocator);
+}
+
+
+const ZSTD_DDict* ZSTD_initStaticDDict(
+                                void* sBuffer, size_t sBufferSize,
+                                const void* dict, size_t dictSize,
+                                ZSTD_dictLoadMethod_e dictLoadMethod,
+                                ZSTD_dictContentType_e dictContentType)
+{
+    size_t const neededSpace = sizeof(ZSTD_DDict)
+                             + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize);
+    ZSTD_DDict* const ddict = (ZSTD_DDict*)sBuffer;
+    assert(sBuffer != NULL);
+    assert(dict != NULL);
+    if ((size_t)sBuffer & 7) return NULL;   /* 8-aligned */
+    if (sBufferSize < neededSpace) return NULL;
+    if (dictLoadMethod == ZSTD_dlm_byCopy) {
+        ZSTD_memcpy(ddict+1, dict, dictSize);  /* local copy */
+        dict = ddict+1;
+    }
+    if (ZSTD_isError( ZSTD_initDDict_internal(ddict,
+                                              dict, dictSize,
+                                              ZSTD_dlm_byRef, dictContentType) ))
+        return NULL;
+    return ddict;
+}
+
+
+size_t ZSTD_freeDDict(ZSTD_DDict* ddict)
+{
+    if (ddict==NULL) return 0;   /* support free on NULL */
+    {   ZSTD_customMem const cMem = ddict->cMem;
+        ZSTD_customFree(ddict->dictBuffer, cMem);
+        ZSTD_customFree(ddict, cMem);
+        return 0;
+    }
+}
+
+/*! ZSTD_estimateDDictSize() :
+ *  Estimate amount of memory that will be needed to create a dictionary for decompression.
+ *  Note : dictionary created by reference using ZSTD_dlm_byRef are smaller */
+size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod)
+{
+    return sizeof(ZSTD_DDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize);
+}
+
+size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict)
+{
+    if (ddict==NULL) return 0;   /* support sizeof on NULL */
+    return sizeof(*ddict) + (ddict->dictBuffer ? ddict->dictSize : 0) ;
+}
+
+/*! ZSTD_getDictID_fromDDict() :
+ *  Provides the dictID of the dictionary loaded into `ddict`.
+ *  If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
+ *  Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict)
+{
+    if (ddict==NULL) return 0;
+    return ddict->dictID;
+}
--- a/third_party/zstd/lib/decompress/zstd_ddict.h
+++ b/third_party/zstd/lib/decompress/zstd_ddict.h
@ -0,0 +1,45 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+#ifndef ZSTD_DDICT_H
+#define ZSTD_DDICT_H
+
+/*-*******************************************************
+ *  Dependencies
+ *********************************************************/
+#include "third_party/zstd/lib/common/zstd_deps.h"   /* size_t */
+#include "third_party/zstd/zstd.h"     /* ZSTD_DDict, and several public functions */
+
+
+/*-*******************************************************
+ *  Interface
+ *********************************************************/
+
+/* note: several prototypes are already published in `zstd.h` :
+ * ZSTD_createDDict()
+ * ZSTD_createDDict_byReference()
+ * ZSTD_createDDict_advanced()
+ * ZSTD_freeDDict()
+ * ZSTD_initStaticDDict()
+ * ZSTD_sizeof_DDict()
+ * ZSTD_estimateDDictSize()
+ * ZSTD_getDictID_fromDict()
+ */
+
+const void* ZSTD_DDict_dictContent(const ZSTD_DDict* ddict);
+size_t ZSTD_DDict_dictSize(const ZSTD_DDict* ddict);
+
+void ZSTD_copyDDictParameters(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
+
+
+
+#endif /* ZSTD_DDICT_H */
--- a/third_party/zstd/lib/decompress/zstd_decompress.c
+++ b/third_party/zstd/lib/decompress/zstd_decompress.c
--- a/third_party/zstd/lib/decompress/zstd_decompress_block.c
+++ b/third_party/zstd/lib/decompress/zstd_decompress_block.c
--- a/third_party/zstd/lib/decompress/zstd_decompress_block.h
+++ b/third_party/zstd/lib/decompress/zstd_decompress_block.h
@ -0,0 +1,74 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+#ifndef ZSTD_DEC_BLOCK_H
+#define ZSTD_DEC_BLOCK_H
+
+/*-*******************************************************
+ *  Dependencies
+ *********************************************************/
+#include "third_party/zstd/lib/common/zstd_deps.h"   /* size_t */
+#include "third_party/zstd/zstd.h"    /* DCtx, and some public functions */
+#include "third_party/zstd/lib/common/zstd_internal.h"  /* blockProperties_t, and some public functions */
+#include "third_party/zstd/lib/decompress/zstd_decompress_internal.h"  /* ZSTD_seqSymbol */
+
+
+/* ===   Prototypes   === */
+
+/* note: prototypes already published within `zstd.h` :
+ * ZSTD_decompressBlock()
+ */
+
+/* note: prototypes already published within `zstd_internal.h` :
+ * ZSTD_getcBlockSize()
+ * ZSTD_decodeSeqHeaders()
+ */
+
+
+ /* Streaming state is used to inform allocation of the literal buffer */
+typedef enum {
+    not_streaming = 0,
+    is_streaming = 1
+} streaming_operation;
+
+/* ZSTD_decompressBlock_internal() :
+ * decompress block, starting at `src`,
+ * into destination buffer `dst`.
+ * @return : decompressed block size,
+ *           or an error code (which can be tested using ZSTD_isError())
+ */
+size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+                               void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize, const streaming_operation streaming);
+
+/* ZSTD_buildFSETable() :
+ * generate FSE decoding table for one symbol (ll, ml or off)
+ * this function must be called with valid parameters only
+ * (dt is large enough, normalizedCounter distribution total is a power of 2, max is within range, etc.)
+ * in which case it cannot fail.
+ * The workspace must be 4-byte aligned and at least ZSTD_BUILD_FSE_TABLE_WKSP_SIZE bytes, which is
+ * defined in zstd_decompress_internal.h.
+ * Internal use only.
+ */
+void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
+             const short* normalizedCounter, unsigned maxSymbolValue,
+             const U32* baseValue, const U8* nbAdditionalBits,
+                   unsigned tableLog, void* wksp, size_t wkspSize,
+                   int bmi2);
+
+/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. */
+size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize);
+
+
+#endif /* ZSTD_DEC_BLOCK_H */
--- a/third_party/zstd/lib/decompress/zstd_decompress_internal.h
+++ b/third_party/zstd/lib/decompress/zstd_decompress_internal.h
@ -0,0 +1,241 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* zstd_decompress_internal:
+ * objects and definitions shared within lib/decompress modules */
+
+ #ifndef ZSTD_DECOMPRESS_INTERNAL_H
+ #define ZSTD_DECOMPRESS_INTERNAL_H
+
+
+/*-*******************************************************
+ *  Dependencies
+ *********************************************************/
+#include "third_party/zstd/lib/common/mem.h"             /* BYTE, U16, U32 */
+#include "third_party/zstd/lib/common/zstd_internal.h"   /* constants : MaxLL, MaxML, MaxOff, LLFSELog, etc. */
+
+
+
+/*-*******************************************************
+ *  Constants
+ *********************************************************/
+static UNUSED_ATTR const U32 LL_base[MaxLL+1] = {
+                 0,    1,    2,     3,     4,     5,     6,      7,
+                 8,    9,   10,    11,    12,    13,    14,     15,
+                16,   18,   20,    22,    24,    28,    32,     40,
+                48,   64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
+                0x2000, 0x4000, 0x8000, 0x10000 };
+
+static UNUSED_ATTR const U32 OF_base[MaxOff+1] = {
+                 0,        1,       1,       5,     0xD,     0x1D,     0x3D,     0x7D,
+                 0xFD,   0x1FD,   0x3FD,   0x7FD,   0xFFD,   0x1FFD,   0x3FFD,   0x7FFD,
+                 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD,
+                 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD, 0x1FFFFFFD, 0x3FFFFFFD, 0x7FFFFFFD };
+
+static UNUSED_ATTR const U8 OF_bits[MaxOff+1] = {
+                     0,  1,  2,  3,  4,  5,  6,  7,
+                     8,  9, 10, 11, 12, 13, 14, 15,
+                    16, 17, 18, 19, 20, 21, 22, 23,
+                    24, 25, 26, 27, 28, 29, 30, 31 };
+
+static UNUSED_ATTR const U32 ML_base[MaxML+1] = {
+                     3,  4,  5,    6,     7,     8,     9,    10,
+                    11, 12, 13,   14,    15,    16,    17,    18,
+                    19, 20, 21,   22,    23,    24,    25,    26,
+                    27, 28, 29,   30,    31,    32,    33,    34,
+                    35, 37, 39,   41,    43,    47,    51,    59,
+                    67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
+                    0x1003, 0x2003, 0x4003, 0x8003, 0x10003 };
+
+
+/*-*******************************************************
+ *  Decompression types
+ *********************************************************/
+ typedef struct {
+     U32 fastMode;
+     U32 tableLog;
+ } ZSTD_seqSymbol_header;
+
+ typedef struct {
+     U16  nextState;
+     BYTE nbAdditionalBits;
+     BYTE nbBits;
+     U32  baseValue;
+ } ZSTD_seqSymbol;
+
+ #define SEQSYMBOL_TABLE_SIZE(log)   (1 + (1 << (log)))
+
+#define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64))
+#define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32))
+#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12
+
+typedef struct {
+    ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)];    /* Note : Space reserved for FSE Tables */
+    ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)];   /* is also used as temporary workspace while building hufTable during DDict creation */
+    ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)];    /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */
+    HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)];  /* can accommodate HUF_decompress4X */
+    U32 rep[ZSTD_REP_NUM];
+    U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32];
+} ZSTD_entropyDTables_t;
+
+typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
+               ZSTDds_decodeBlockHeader, ZSTDds_decompressBlock,
+               ZSTDds_decompressLastBlock, ZSTDds_checkChecksum,
+               ZSTDds_decodeSkippableHeader, ZSTDds_skipFrame } ZSTD_dStage;
+
+typedef enum { zdss_init=0, zdss_loadHeader,
+               zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage;
+
+typedef enum {
+    ZSTD_use_indefinitely = -1,  /* Use the dictionary indefinitely */
+    ZSTD_dont_use = 0,           /* Do not use the dictionary (if one exists free it) */
+    ZSTD_use_once = 1            /* Use the dictionary once and set to ZSTD_dont_use */
+} ZSTD_dictUses_e;
+
+/* Hashset for storing references to multiple ZSTD_DDict within ZSTD_DCtx */
+typedef struct {
+    const ZSTD_DDict** ddictPtrTable;
+    size_t ddictPtrTableSize;
+    size_t ddictPtrCount;
+} ZSTD_DDictHashSet;
+
+#ifndef ZSTD_DECODER_INTERNAL_BUFFER
+#  define ZSTD_DECODER_INTERNAL_BUFFER  (1 << 16)
+#endif
+
+#define ZSTD_LBMIN 64
+#define ZSTD_LBMAX (128 << 10)
+
+/* extra buffer, compensates when dst is not large enough to store litBuffer */
+#define ZSTD_LITBUFFEREXTRASIZE  BOUNDED(ZSTD_LBMIN, ZSTD_DECODER_INTERNAL_BUFFER, ZSTD_LBMAX)
+
+typedef enum {
+    ZSTD_not_in_dst = 0,  /* Stored entirely within litExtraBuffer */
+    ZSTD_in_dst = 1,           /* Stored entirely within dst (in memory after current output write) */
+    ZSTD_split = 2            /* Split between litExtraBuffer and dst */
+} ZSTD_litLocation_e;
+
+struct ZSTD_DCtx_s
+{
+    const ZSTD_seqSymbol* LLTptr;
+    const ZSTD_seqSymbol* MLTptr;
+    const ZSTD_seqSymbol* OFTptr;
+    const HUF_DTable* HUFptr;
+    ZSTD_entropyDTables_t entropy;
+    U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];   /* space needed when building huffman tables */
+    const void* previousDstEnd;   /* detect continuity */
+    const void* prefixStart;      /* start of current segment */
+    const void* virtualStart;     /* virtual start of previous segment if it was just before current one */
+    const void* dictEnd;          /* end of previous segment */
+    size_t expected;
+    ZSTD_frameHeader fParams;
+    U64 processedCSize;
+    U64 decodedSize;
+    blockType_e bType;            /* used in ZSTD_decompressContinue(), store blockType between block header decoding and block decompression stages */
+    ZSTD_dStage stage;
+    U32 litEntropy;
+    U32 fseEntropy;
+    XXH64_state_t xxhState;
+    size_t headerSize;
+    ZSTD_format_e format;
+    ZSTD_forceIgnoreChecksum_e forceIgnoreChecksum;   /* User specified: if == 1, will ignore checksums in compressed frame. Default == 0 */
+    U32 validateChecksum;         /* if == 1, will validate checksum. Is == 1 if (fParams.checksumFlag == 1) and (forceIgnoreChecksum == 0). */
+    const BYTE* litPtr;
+    ZSTD_customMem customMem;
+    size_t litSize;
+    size_t rleSize;
+    size_t staticSize;
+    int isFrameDecompression;
+#if DYNAMIC_BMI2 != 0
+    int bmi2;                     /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
+#endif
+
+    /* dictionary */
+    ZSTD_DDict* ddictLocal;
+    const ZSTD_DDict* ddict;     /* set by ZSTD_initDStream_usingDDict(), or ZSTD_DCtx_refDDict() */
+    U32 dictID;
+    int ddictIsCold;             /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */
+    ZSTD_dictUses_e dictUses;
+    ZSTD_DDictHashSet* ddictSet;                    /* Hash set for multiple ddicts */
+    ZSTD_refMultipleDDicts_e refMultipleDDicts;     /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */
+    int disableHufAsm;
+    int maxBlockSizeParam;
+
+    /* streaming */
+    ZSTD_dStreamStage streamStage;
+    char*  inBuff;
+    size_t inBuffSize;
+    size_t inPos;
+    size_t maxWindowSize;
+    char*  outBuff;
+    size_t outBuffSize;
+    size_t outStart;
+    size_t outEnd;
+    size_t lhSize;
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+    void* legacyContext;
+    U32 previousLegacyVersion;
+    U32 legacyVersion;
+#endif
+    U32 hostageByte;
+    int noForwardProgress;
+    ZSTD_bufferMode_e outBufferMode;
+    ZSTD_outBuffer expectedOutBuffer;
+
+    /* workspace */
+    BYTE* litBuffer;
+    const BYTE* litBufferEnd;
+    ZSTD_litLocation_e litBufferLocation;
+    BYTE litExtraBuffer[ZSTD_LITBUFFEREXTRASIZE + WILDCOPY_OVERLENGTH]; /* literal buffer can be split between storage within dst and within this scratch buffer */
+    BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX];
+
+    size_t oversizedDuration;
+
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    void const* dictContentBeginForFuzzing;
+    void const* dictContentEndForFuzzing;
+#endif
+
+    /* Tracing */
+#if ZSTD_TRACE
+    ZSTD_TraceCtx traceCtx;
+#endif
+};  /* typedef'd to ZSTD_DCtx within "zstd.h" */
+
+MEM_STATIC int ZSTD_DCtx_get_bmi2(const struct ZSTD_DCtx_s *dctx) {
+#if DYNAMIC_BMI2 != 0
+	return dctx->bmi2;
+#else
+    (void)dctx;
+	return 0;
+#endif
+}
+
+/*-*******************************************************
+ *  Shared internal functions
+ *********************************************************/
+
+/*! ZSTD_loadDEntropy() :
+ *  dict : must point at beginning of a valid zstd dictionary.
+ * @return : size of dictionary header (size of magic number + dict ID + entropy tables) */
+size_t ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
+                   const void* const dict, size_t const dictSize);
+
+/*! ZSTD_checkContinuity() :
+ *  check if next `dst` follows previous position, where decompression ended.
+ *  If yes, do nothing (continue on current segment).
+ *  If not, classify previous segment as "external dictionary", and start a new segment.
+ *  This function cannot fail. */
+void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize);
+
+
+#endif /* ZSTD_DECOMPRESS_INTERNAL_H */
--- a/third_party/zstd/lib/deprecated/zbuff.h
+++ b/third_party/zstd/lib/deprecated/zbuff.h
@ -0,0 +1,215 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* ***************************************************************
+*  NOTES/WARNINGS
+******************************************************************/
+/* The streaming API defined here is deprecated.
+ * Consider migrating towards ZSTD_compressStream() API in `zstd.h`
+ * See 'lib/README.md'.
+ *****************************************************************/
+
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef ZSTD_BUFFERED_H_23987
+#define ZSTD_BUFFERED_H_23987
+
+/* *************************************
+*  Dependencies
+***************************************/
+      /* size_t */
+#include "third_party/zstd/zstd.h"        /* ZSTD_CStream, ZSTD_DStream, ZSTDLIB_API */
+
+
+/* ***************************************************************
+*  Compiler specifics
+*****************************************************************/
+/* Deprecation warnings */
+/* Should these warnings be a problem,
+ * it is generally possible to disable them,
+ * typically with -Wno-deprecated-declarations for gcc
+ * or _CRT_SECURE_NO_WARNINGS in Visual.
+ * Otherwise, it's also possible to define ZBUFF_DISABLE_DEPRECATE_WARNINGS
+ */
+#ifdef ZBUFF_DISABLE_DEPRECATE_WARNINGS
+#  define ZBUFF_DEPRECATED(message) ZSTDLIB_API  /* disable deprecation warnings */
+#else
+#  if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
+#    define ZBUFF_DEPRECATED(message) [[deprecated(message)]] ZSTDLIB_API
+#  elif (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__)
+#    define ZBUFF_DEPRECATED(message) ZSTDLIB_API __attribute__((deprecated(message)))
+#  elif defined(__GNUC__) && (__GNUC__ >= 3)
+#    define ZBUFF_DEPRECATED(message) ZSTDLIB_API __attribute__((deprecated))
+#  elif defined(_MSC_VER)
+#    define ZBUFF_DEPRECATED(message) ZSTDLIB_API __declspec(deprecated(message))
+#  else
+#    pragma message("WARNING: You need to implement ZBUFF_DEPRECATED for this compiler")
+#    define ZBUFF_DEPRECATED(message) ZSTDLIB_API
+#  endif
+#endif /* ZBUFF_DISABLE_DEPRECATE_WARNINGS */
+
+
+/* *************************************
+*  Streaming functions
+***************************************/
+/* This is the easier "buffered" streaming API,
+*  using an internal buffer to lift all restrictions on user-provided buffers
+*  which can be any size, any place, for both input and output.
+*  ZBUFF and ZSTD are 100% interoperable,
+*  frames created by one can be decoded by the other one */
+
+typedef ZSTD_CStream ZBUFF_CCtx;
+ZBUFF_DEPRECATED("use ZSTD_createCStream") ZBUFF_CCtx* ZBUFF_createCCtx(void);
+ZBUFF_DEPRECATED("use ZSTD_freeCStream")   size_t      ZBUFF_freeCCtx(ZBUFF_CCtx* cctx);
+
+ZBUFF_DEPRECATED("use ZSTD_initCStream")           size_t ZBUFF_compressInit(ZBUFF_CCtx* cctx, int compressionLevel);
+ZBUFF_DEPRECATED("use ZSTD_initCStream_usingDict") size_t ZBUFF_compressInitDictionary(ZBUFF_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
+
+ZBUFF_DEPRECATED("use ZSTD_compressStream") size_t ZBUFF_compressContinue(ZBUFF_CCtx* cctx, void* dst, size_t* dstCapacityPtr, const void* src, size_t* srcSizePtr);
+ZBUFF_DEPRECATED("use ZSTD_flushStream")    size_t ZBUFF_compressFlush(ZBUFF_CCtx* cctx, void* dst, size_t* dstCapacityPtr);
+ZBUFF_DEPRECATED("use ZSTD_endStream")      size_t ZBUFF_compressEnd(ZBUFF_CCtx* cctx, void* dst, size_t* dstCapacityPtr);
+
+/*-*************************************************
+*  Streaming compression - howto
+*
+*  A ZBUFF_CCtx object is required to track streaming operation.
+*  Use ZBUFF_createCCtx() and ZBUFF_freeCCtx() to create/release resources.
+*  ZBUFF_CCtx objects can be reused multiple times.
+*
+*  Start by initializing ZBUF_CCtx.
+*  Use ZBUFF_compressInit() to start a new compression operation.
+*  Use ZBUFF_compressInitDictionary() for a compression which requires a dictionary.
+*
+*  Use ZBUFF_compressContinue() repetitively to consume input stream.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written within *srcSizePtr and *dstCapacityPtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to present again remaining data.
+*  The content of `dst` will be overwritten (up to *dstCapacityPtr) at each call, so save its content if it matters or change @dst .
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's just a hint, to improve latency)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  At any moment, it's possible to flush whatever data remains within buffer, using ZBUFF_compressFlush().
+*  The nb of bytes written into `dst` will be reported into *dstCapacityPtr.
+*  Note that the function cannot output more than *dstCapacityPtr,
+*  therefore, some content might still be left into internal buffer if *dstCapacityPtr is too small.
+*  @return : nb of bytes still present into internal buffer (0 if it's empty)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  ZBUFF_compressEnd() instructs to finish a frame.
+*  It will perform a flush and write frame epilogue.
+*  The epilogue is required for decoders to consider a frame completed.
+*  Similar to ZBUFF_compressFlush(), it may not be able to output the entire internal buffer content if *dstCapacityPtr is too small.
+*  In which case, call again ZBUFF_compressFlush() to complete the flush.
+*  @return : nb of bytes still present into internal buffer (0 if it's empty)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  Hint : _recommended buffer_ sizes (not compulsory) : ZBUFF_recommendedCInSize() / ZBUFF_recommendedCOutSize()
+*  input : ZBUFF_recommendedCInSize==128 KB block size is the internal unit, use this value to reduce intermediate stages (better latency)
+*  output : ZBUFF_recommendedCOutSize==ZSTD_compressBound(128 KB) + 3 + 3 : ensures it's always possible to write/flush/end a full block. Skip some buffering.
+*  By using both, it ensures that input will be entirely consumed, and output will always contain the result, reducing intermediate buffering.
+* **************************************************/
+
+
+typedef ZSTD_DStream ZBUFF_DCtx;
+ZBUFF_DEPRECATED("use ZSTD_createDStream") ZBUFF_DCtx* ZBUFF_createDCtx(void);
+ZBUFF_DEPRECATED("use ZSTD_freeDStream")   size_t      ZBUFF_freeDCtx(ZBUFF_DCtx* dctx);
+
+ZBUFF_DEPRECATED("use ZSTD_initDStream")           size_t ZBUFF_decompressInit(ZBUFF_DCtx* dctx);
+ZBUFF_DEPRECATED("use ZSTD_initDStream_usingDict") size_t ZBUFF_decompressInitDictionary(ZBUFF_DCtx* dctx, const void* dict, size_t dictSize);
+
+ZBUFF_DEPRECATED("use ZSTD_decompressStream") size_t ZBUFF_decompressContinue(ZBUFF_DCtx* dctx,
+                                            void* dst, size_t* dstCapacityPtr,
+                                      const void* src, size_t* srcSizePtr);
+
+/*-***************************************************************************
+*  Streaming decompression howto
+*
+*  A ZBUFF_DCtx object is required to track streaming operations.
+*  Use ZBUFF_createDCtx() and ZBUFF_freeDCtx() to create/release resources.
+*  Use ZBUFF_decompressInit() to start a new decompression operation,
+*   or ZBUFF_decompressInitDictionary() if decompression requires a dictionary.
+*  Note that ZBUFF_DCtx objects can be re-init multiple times.
+*
+*  Use ZBUFF_decompressContinue() repetitively to consume your input.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+*  The content of `dst` will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters, or change `dst`.
+*  @return : 0 when a frame is completely decoded and fully flushed,
+*            1 when there is still some data left within internal buffer to flush,
+*            >1 when more data is expected, with value being a suggested next input size (it's just a hint, which helps latency),
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory) : ZBUFF_recommendedDInSize() and ZBUFF_recommendedDOutSize()
+*  output : ZBUFF_recommendedDOutSize== 128 KB block size is the internal unit, it ensures it's always possible to write a full block when decoded.
+*  input  : ZBUFF_recommendedDInSize == 128KB + 3;
+*           just follow indications from ZBUFF_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* *******************************************************************************/
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+ZBUFF_DEPRECATED("use ZSTD_isError")      unsigned ZBUFF_isError(size_t errorCode);
+ZBUFF_DEPRECATED("use ZSTD_getErrorName") const char* ZBUFF_getErrorName(size_t errorCode);
+
+/** Functions below provide recommended buffer sizes for Compression or Decompression operations.
+*   These sizes are just hints, they tend to offer better latency */
+ZBUFF_DEPRECATED("use ZSTD_CStreamInSize")  size_t ZBUFF_recommendedCInSize(void);
+ZBUFF_DEPRECATED("use ZSTD_CStreamOutSize") size_t ZBUFF_recommendedCOutSize(void);
+ZBUFF_DEPRECATED("use ZSTD_DStreamInSize")  size_t ZBUFF_recommendedDInSize(void);
+ZBUFF_DEPRECATED("use ZSTD_DStreamOutSize") size_t ZBUFF_recommendedDOutSize(void);
+
+#endif  /* ZSTD_BUFFERED_H_23987 */
+
+
+#ifdef ZBUFF_STATIC_LINKING_ONLY
+#ifndef ZBUFF_STATIC_H_30298098432
+#define ZBUFF_STATIC_H_30298098432
+
+/* ====================================================================================
+ * The definitions in this section are considered experimental.
+ * They should never be used in association with a dynamic library, as they may change in the future.
+ * They are provided for advanced usages.
+ * Use them only in association with static linking.
+ * ==================================================================================== */
+
+/*--- Dependency ---*/
+#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_parameters, ZSTD_customMem */
+#include "third_party/zstd/zstd.h"
+
+
+/*--- Custom memory allocator ---*/
+/*! ZBUFF_createCCtx_advanced() :
+ *  Create a ZBUFF compression context using external alloc and free functions */
+ZBUFF_DEPRECATED("use ZSTD_createCStream_advanced") ZBUFF_CCtx* ZBUFF_createCCtx_advanced(ZSTD_customMem customMem);
+
+/*! ZBUFF_createDCtx_advanced() :
+ *  Create a ZBUFF decompression context using external alloc and free functions */
+ZBUFF_DEPRECATED("use ZSTD_createDStream_advanced") ZBUFF_DCtx* ZBUFF_createDCtx_advanced(ZSTD_customMem customMem);
+
+
+/*--- Advanced Streaming Initialization ---*/
+ZBUFF_DEPRECATED("use ZSTD_initDStream_usingDict") size_t ZBUFF_compressInit_advanced(ZBUFF_CCtx* zbc,
+                                               const void* dict, size_t dictSize,
+                                               ZSTD_parameters params, unsigned long long pledgedSrcSize);
+
+
+#endif    /* ZBUFF_STATIC_H_30298098432 */
+#endif    /* ZBUFF_STATIC_LINKING_ONLY */
+
+
+#if defined (__cplusplus)
+}
+#endif
--- a/third_party/zstd/lib/deprecated/zbuff_common.c
+++ b/third_party/zstd/lib/deprecated/zbuff_common.c
@ -0,0 +1,27 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include "third_party/zstd/lib/common/error_private.h"
+#include "third_party/zstd/lib/deprecated/zbuff.h"
+
+/*-****************************************
+*  ZBUFF Error Management  (deprecated)
+******************************************/
+
+/*! ZBUFF_isError() :
+*   tells if a return value is an error code */
+unsigned ZBUFF_isError(size_t errorCode) { return ERR_isError(errorCode); }
+/*! ZBUFF_getErrorName() :
+*   provides error code string from function result (useful for debugging) */
+const char* ZBUFF_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
--- a/third_party/zstd/lib/deprecated/zbuff_compress.c
+++ b/third_party/zstd/lib/deprecated/zbuff_compress.c
@ -0,0 +1,168 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+
+/* *************************************
+*  Dependencies
+***************************************/
+#define ZBUFF_STATIC_LINKING_ONLY
+#include "third_party/zstd/lib/deprecated/zbuff.h"
+#include "third_party/zstd/lib/common/error_private.h"
+
+
+/*-***********************************************************
+*  Streaming compression
+*
+*  A ZBUFF_CCtx object is required to track streaming operation.
+*  Use ZBUFF_createCCtx() and ZBUFF_freeCCtx() to create/release resources.
+*  Use ZBUFF_compressInit() to start a new compression operation.
+*  ZBUFF_CCtx objects can be reused multiple times.
+*
+*  Use ZBUFF_compressContinue() repetitively to consume your input.
+*  *srcSizePtr and *dstCapacityPtr can be any size.
+*  The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+*  Note that it may not consume the entire input, in which case it's up to the caller to call again the function with remaining input.
+*  The content of dst will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters or change dst .
+*  @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  ZBUFF_compressFlush() can be used to instruct ZBUFF to compress and output whatever remains within its buffer.
+*  Note that it will not output more than *dstCapacityPtr.
+*  Therefore, some content might still be left into its internal buffer if dst buffer is too small.
+*  @return : nb of bytes still present into internal buffer (0 if it's empty)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  ZBUFF_compressEnd() instructs to finish a frame.
+*  It will perform a flush and write frame epilogue.
+*  Similar to ZBUFF_compressFlush(), it may not be able to output the entire internal buffer content if *dstCapacityPtr is too small.
+*  @return : nb of bytes still present into internal buffer (0 if it's empty)
+*            or an error code, which can be tested using ZBUFF_isError().
+*
+*  Hint : recommended buffer sizes (not compulsory)
+*  input : ZSTD_BLOCKSIZE_MAX (128 KB), internal unit size, it improves latency to use this value.
+*  output : ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + ZBUFF_endFrameSize : ensures it's always possible to write/flush/end a full block at best speed.
+* ***********************************************************/
+
+ZBUFF_CCtx* ZBUFF_createCCtx(void)
+{
+    return ZSTD_createCStream();
+}
+
+ZBUFF_CCtx* ZBUFF_createCCtx_advanced(ZSTD_customMem customMem)
+{
+    return ZSTD_createCStream_advanced(customMem);
+}
+
+size_t ZBUFF_freeCCtx(ZBUFF_CCtx* zbc)
+{
+    return ZSTD_freeCStream(zbc);
+}
+
+
+/* ======   Initialization   ====== */
+
+size_t ZBUFF_compressInit_advanced(ZBUFF_CCtx* zbc,
+                                   const void* dict, size_t dictSize,
+                                   ZSTD_parameters params, unsigned long long pledgedSrcSize)
+{
+    if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN;  /* preserve "0 == unknown" behavior */
+    FORWARD_IF_ERROR(ZSTD_CCtx_reset(zbc, ZSTD_reset_session_only), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setPledgedSrcSize(zbc, pledgedSrcSize), "");
+
+    FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zbc, ZSTD_c_windowLog, params.cParams.windowLog), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zbc, ZSTD_c_hashLog, params.cParams.hashLog), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zbc, ZSTD_c_chainLog, params.cParams.chainLog), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zbc, ZSTD_c_searchLog, params.cParams.searchLog), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zbc, ZSTD_c_minMatch, params.cParams.minMatch), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zbc, ZSTD_c_targetLength, params.cParams.targetLength), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zbc, ZSTD_c_strategy, params.cParams.strategy), "");
+
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zbc, ZSTD_c_contentSizeFlag, params.fParams.contentSizeFlag), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zbc, ZSTD_c_checksumFlag, params.fParams.checksumFlag), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zbc, ZSTD_c_dictIDFlag, params.fParams.noDictIDFlag), "");
+
+    FORWARD_IF_ERROR(ZSTD_CCtx_loadDictionary(zbc, dict, dictSize), "");
+    return 0;
+}
+
+size_t ZBUFF_compressInitDictionary(ZBUFF_CCtx* zbc, const void* dict, size_t dictSize, int compressionLevel)
+{
+    FORWARD_IF_ERROR(ZSTD_CCtx_reset(zbc, ZSTD_reset_session_only), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(zbc, ZSTD_c_compressionLevel, compressionLevel), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_loadDictionary(zbc, dict, dictSize), "");
+    return 0;
+}
+
+size_t ZBUFF_compressInit(ZBUFF_CCtx* zbc, int compressionLevel)
+{
+    return ZSTD_initCStream(zbc, compressionLevel);
+}
+
+/* ======   Compression   ====== */
+
+
+size_t ZBUFF_compressContinue(ZBUFF_CCtx* zbc,
+                              void* dst, size_t* dstCapacityPtr,
+                        const void* src, size_t* srcSizePtr)
+{
+    size_t result;
+    ZSTD_outBuffer outBuff;
+    ZSTD_inBuffer inBuff;
+    outBuff.dst = dst;
+    outBuff.pos = 0;
+    outBuff.size = *dstCapacityPtr;
+    inBuff.src = src;
+    inBuff.pos = 0;
+    inBuff.size = *srcSizePtr;
+    result = ZSTD_compressStream(zbc, &outBuff, &inBuff);
+    *dstCapacityPtr = outBuff.pos;
+    *srcSizePtr = inBuff.pos;
+    return result;
+}
+
+
+
+/* ======   Finalize   ====== */
+
+size_t ZBUFF_compressFlush(ZBUFF_CCtx* zbc, void* dst, size_t* dstCapacityPtr)
+{
+    size_t result;
+    ZSTD_outBuffer outBuff;
+    outBuff.dst = dst;
+    outBuff.pos = 0;
+    outBuff.size = *dstCapacityPtr;
+    result = ZSTD_flushStream(zbc, &outBuff);
+    *dstCapacityPtr = outBuff.pos;
+    return result;
+}
+
+
+size_t ZBUFF_compressEnd(ZBUFF_CCtx* zbc, void* dst, size_t* dstCapacityPtr)
+{
+    size_t result;
+    ZSTD_outBuffer outBuff;
+    outBuff.dst = dst;
+    outBuff.pos = 0;
+    outBuff.size = *dstCapacityPtr;
+    result = ZSTD_endStream(zbc, &outBuff);
+    *dstCapacityPtr = outBuff.pos;
+    return result;
+}
+
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+size_t ZBUFF_recommendedCInSize(void)  { return ZSTD_CStreamInSize(); }
+size_t ZBUFF_recommendedCOutSize(void) { return ZSTD_CStreamOutSize(); }
--- a/third_party/zstd/lib/deprecated/zbuff_decompress.c
+++ b/third_party/zstd/lib/deprecated/zbuff_decompress.c
@ -0,0 +1,78 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+
+/* *************************************
+*  Dependencies
+***************************************/
+#define ZSTD_DISABLE_DEPRECATE_WARNINGS  /* suppress warning on ZSTD_initDStream_usingDict */
+#include "third_party/zstd/zstd.h"        /* ZSTD_CStream, ZSTD_DStream, ZSTDLIB_API */
+#define ZBUFF_STATIC_LINKING_ONLY
+#include "third_party/zstd/lib/deprecated/zbuff.h"
+
+
+ZBUFF_DCtx* ZBUFF_createDCtx(void)
+{
+    return ZSTD_createDStream();
+}
+
+ZBUFF_DCtx* ZBUFF_createDCtx_advanced(ZSTD_customMem customMem)
+{
+    return ZSTD_createDStream_advanced(customMem);
+}
+
+size_t ZBUFF_freeDCtx(ZBUFF_DCtx* zbd)
+{
+    return ZSTD_freeDStream(zbd);
+}
+
+
+/* *** Initialization *** */
+
+size_t ZBUFF_decompressInitDictionary(ZBUFF_DCtx* zbd, const void* dict, size_t dictSize)
+{
+    return ZSTD_initDStream_usingDict(zbd, dict, dictSize);
+}
+
+size_t ZBUFF_decompressInit(ZBUFF_DCtx* zbd)
+{
+    return ZSTD_initDStream(zbd);
+}
+
+
+/* *** Decompression *** */
+
+size_t ZBUFF_decompressContinue(ZBUFF_DCtx* zbd,
+                                void* dst, size_t* dstCapacityPtr,
+                          const void* src, size_t* srcSizePtr)
+{
+    ZSTD_outBuffer outBuff;
+    ZSTD_inBuffer inBuff;
+    size_t result;
+    outBuff.dst  = dst;
+    outBuff.pos  = 0;
+    outBuff.size = *dstCapacityPtr;
+    inBuff.src  = src;
+    inBuff.pos  = 0;
+    inBuff.size = *srcSizePtr;
+    result = ZSTD_decompressStream(zbd, &outBuff, &inBuff);
+    *dstCapacityPtr = outBuff.pos;
+    *srcSizePtr = inBuff.pos;
+    return result;
+}
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+size_t ZBUFF_recommendedDInSize(void)  { return ZSTD_DStreamInSize(); }
+size_t ZBUFF_recommendedDOutSize(void) { return ZSTD_DStreamOutSize(); }
--- a/third_party/zstd/lib/dictBuilder/cover.c
+++ b/third_party/zstd/lib/dictBuilder/cover.c
--- a/third_party/zstd/lib/dictBuilder/cover.h
+++ b/third_party/zstd/lib/dictBuilder/cover.h
@ -0,0 +1,190 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#  define ZDICT_STATIC_LINKING_ONLY
+#endif
+
+#include "libc/calls/calls.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/fmt/fmt.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/stdio.h"
+#include "libc/stdio/temp.h"
+#include "third_party/musl/tempnam.h"  /* fprintf */
+#include "libc/calls/calls.h"
+#include "libc/calls/termios.h"
+#include "libc/fmt/conv.h"
+#include "libc/limits.h"
+#include "libc/mem/alg.h"
+#include "libc/mem/alloca.h"
+#include "libc/mem/mem.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/rand.h"
+#include "libc/stdio/temp.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/exit.h"
+#include "third_party/getopt/getopt.h"
+#include "third_party/musl/crypt.h"
+#include "third_party/musl/rand48.h" /* malloc, free, qsort */
+#include "libc/mem/alg.h"
+#include "libc/mem/mem.h"
+#include "libc/str/str.h" /* memset */
+#include "libc/calls/calls.h"
+#include "libc/calls/struct/timespec.h"
+#include "libc/calls/struct/timeval.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/sysv/consts/clock.h"
+#include "libc/sysv/consts/sched.h"
+#include "libc/sysv/consts/timer.h"
+#include "libc/time/struct/tm.h"
+#include "libc/time/time.h"   /* clock */
+#include "third_party/zstd/lib/common/mem.h" /* read */
+#include "third_party/zstd/lib/common/pool.h"
+#include "third_party/zstd/lib/common/threading.h"
+#include "third_party/zstd/lib/common/zstd_internal.h" /* includes zstd.h */
+#include "third_party/zstd/zdict.h"
+
+/**
+ * COVER_best_t is used for two purposes:
+ * 1. Synchronizing threads.
+ * 2. Saving the best parameters and dictionary.
+ *
+ * All of the methods except COVER_best_init() are thread safe if zstd is
+ * compiled with multithreaded support.
+ */
+typedef struct COVER_best_s {
+  ZSTD_pthread_mutex_t mutex;
+  ZSTD_pthread_cond_t cond;
+  size_t liveJobs;
+  void *dict;
+  size_t dictSize;
+  ZDICT_cover_params_t parameters;
+  size_t compressedSize;
+} COVER_best_t;
+
+/**
+ * A segment is a range in the source as well as the score of the segment.
+ */
+typedef struct {
+  U32 begin;
+  U32 end;
+  U32 score;
+} COVER_segment_t;
+
+/**
+ *Number of epochs and size of each epoch.
+ */
+typedef struct {
+  U32 num;
+  U32 size;
+} COVER_epoch_info_t;
+
+/**
+ * Struct used for the dictionary selection function.
+ */
+typedef struct COVER_dictSelection {
+  BYTE* dictContent;
+  size_t dictSize;
+  size_t totalCompressedSize;
+} COVER_dictSelection_t;
+
+/**
+ * Computes the number of epochs and the size of each epoch.
+ * We will make sure that each epoch gets at least 10 * k bytes.
+ *
+ * The COVER algorithms divide the data up into epochs of equal size and
+ * select one segment from each epoch.
+ *
+ * @param maxDictSize The maximum allowed dictionary size.
+ * @param nbDmers     The number of dmers we are training on.
+ * @param k           The parameter k (segment size).
+ * @param passes      The target number of passes over the dmer corpus.
+ *                    More passes means a better dictionary.
+ */
+COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
+                                       U32 k, U32 passes);
+
+/**
+ * Warns the user when their corpus is too small.
+ */
+void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
+
+/**
+ *  Checks total compressed size of a dictionary
+ */
+size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
+                                      const size_t *samplesSizes, const BYTE *samples,
+                                      size_t *offsets,
+                                      size_t nbTrainSamples, size_t nbSamples,
+                                      BYTE *const dict, size_t dictBufferCapacity);
+
+/**
+ * Returns the sum of the sample sizes.
+ */
+size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;
+
+/**
+ * Initialize the `COVER_best_t`.
+ */
+void COVER_best_init(COVER_best_t *best);
+
+/**
+ * Wait until liveJobs == 0.
+ */
+void COVER_best_wait(COVER_best_t *best);
+
+/**
+ * Call COVER_best_wait() and then destroy the COVER_best_t.
+ */
+void COVER_best_destroy(COVER_best_t *best);
+
+/**
+ * Called when a thread is about to be launched.
+ * Increments liveJobs.
+ */
+void COVER_best_start(COVER_best_t *best);
+
+/**
+ * Called when a thread finishes executing, both on error or success.
+ * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
+ * If this dictionary is the best so far save it and its parameters.
+ */
+void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
+                       COVER_dictSelection_t selection);
+/**
+ * Error function for COVER_selectDict function. Checks if the return
+ * value is an error.
+ */
+unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
+
+ /**
+  * Error function for COVER_selectDict function. Returns a struct where
+  * return.totalCompressedSize is a ZSTD error.
+  */
+COVER_dictSelection_t COVER_dictSelectionError(size_t error);
+
+/**
+ * Always call after selectDict is called to free up used memory from
+ * newly created dictionary.
+ */
+void COVER_dictSelectionFree(COVER_dictSelection_t selection);
+
+/**
+ * Called to finalize the dictionary and select one based on whether or not
+ * the shrink-dict flag was enabled. If enabled the dictionary used is the
+ * smallest dictionary within a specified regression of the compressed size
+ * from the largest dictionary.
+ */
+ COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
+                       size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
+                       size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
--- a/third_party/zstd/lib/dictBuilder/divsufsort.c
+++ b/third_party/zstd/lib/dictBuilder/divsufsort.c
--- a/third_party/zstd/lib/dictBuilder/divsufsort.h
+++ b/third_party/zstd/lib/dictBuilder/divsufsort.h
@ -0,0 +1,68 @@
+// clang-format off
+/*
+ * divsufsort.h for libdivsufsort-lite
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _DIVSUFSORT_H
+#define _DIVSUFSORT_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+
+/*- Prototypes -*/
+
+/**
+ * Constructs the suffix array of a given string.
+ * @param T [0..n-1] The input string.
+ * @param SA [0..n-1] The output array of suffixes.
+ * @param n The length of the given string.
+ * @param openMP enables OpenMP optimization.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+int
+divsufsort(const unsigned char *T, int *SA, int n, int openMP);
+
+/**
+ * Constructs the burrows-wheeler transformed string of a given string.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string. (can be T)
+ * @param A [0..n-1] The temporary array. (can be NULL)
+ * @param n The length of the given string.
+ * @param num_indexes The length of secondary indexes array. (can be NULL)
+ * @param indexes The secondary indexes array. (can be NULL)
+ * @param openMP enables OpenMP optimization.
+ * @return The primary index if no error occurred, -1 or -2 otherwise.
+ */
+int
+divbwt(const unsigned char *T, unsigned char *U, int *A, int n, unsigned char * num_indexes, int * indexes, int openMP);
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* _DIVSUFSORT_H */
--- a/third_party/zstd/lib/dictBuilder/fastcover.c
+++ b/third_party/zstd/lib/dictBuilder/fastcover.c
@ -0,0 +1,798 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include "libc/calls/calls.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/fmt/fmt.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/stdio.h"
+#include "libc/stdio/temp.h"
+#include "third_party/musl/tempnam.h"  /* fprintf */
+#include "libc/calls/calls.h"
+#include "libc/calls/termios.h"
+#include "libc/fmt/conv.h"
+#include "libc/limits.h"
+#include "libc/mem/alg.h"
+#include "libc/mem/alloca.h"
+#include "libc/mem/mem.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/rand.h"
+#include "libc/stdio/temp.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/exit.h"
+#include "third_party/getopt/getopt.h"
+#include "third_party/musl/crypt.h"
+#include "third_party/musl/rand48.h" /* malloc, free, qsort */
+#include "libc/mem/alg.h"
+#include "libc/mem/mem.h"
+#include "libc/str/str.h" /* memset */
+#include "libc/calls/calls.h"
+#include "libc/calls/struct/timespec.h"
+#include "libc/calls/struct/timeval.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/sysv/consts/clock.h"
+#include "libc/sysv/consts/sched.h"
+#include "libc/sysv/consts/timer.h"
+#include "libc/time/struct/tm.h"
+#include "libc/time/time.h"   /* clock */
+
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#  define ZDICT_STATIC_LINKING_ONLY
+#endif
+
+#include "third_party/zstd/lib/common/mem.h" /* read */
+#include "third_party/zstd/lib/common/pool.h"
+#include "third_party/zstd/lib/common/threading.h"
+#include "third_party/zstd/lib/common/zstd_internal.h" /* includes zstd.h */
+#include "third_party/zstd/lib/compress/zstd_compress_internal.h" /* ZSTD_hash*() */
+#include "third_party/zstd/zdict.h"
+#include "third_party/zstd/lib/dictBuilder/cover.h"
+
+
+/*-*************************************
+*  Constants
+***************************************/
+/**
+* There are 32bit indexes used to ref samples, so limit samples size to 4GB
+* on 64bit builds.
+* For 32bit builds we choose 1 GB.
+* Most 32bit platforms have 2GB user-mode addressable space and we allocate a large
+* contiguous buffer, so 1GB is already a high limit.
+*/
+#define FASTCOVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
+#define FASTCOVER_MAX_F 31
+#define FASTCOVER_MAX_ACCEL 10
+#define FASTCOVER_DEFAULT_SPLITPOINT 0.75
+#define DEFAULT_F 20
+#define DEFAULT_ACCEL 1
+
+
+/*-*************************************
+*  Console display
+***************************************/
+#ifndef LOCALDISPLAYLEVEL
+static int g_displayLevel = 0;
+#endif
+#undef  DISPLAY
+#define DISPLAY(...)                                                           \
+  {                                                                            \
+    fprintf(stderr, __VA_ARGS__);                                              \
+    fflush(stderr);                                                            \
+  }
+#undef  LOCALDISPLAYLEVEL
+#define LOCALDISPLAYLEVEL(displayLevel, l, ...)                                \
+  if (displayLevel >= l) {                                                     \
+    DISPLAY(__VA_ARGS__);                                                      \
+  } /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
+#undef  DISPLAYLEVEL
+#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
+
+#ifndef LOCALDISPLAYUPDATE
+static const clock_t g_refreshRate = CLOCKS_PER_SEC * 15 / 100;
+static clock_t g_time = 0;
+#endif
+#undef  LOCALDISPLAYUPDATE
+#define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
+  if (displayLevel >= l) {                                                     \
+    if ((clock() - g_time > g_refreshRate) || (displayLevel >= 4)) {             \
+      g_time = clock();                                                        \
+      DISPLAY(__VA_ARGS__);                                                    \
+    }                                                                          \
+  }
+#undef  DISPLAYUPDATE
+#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
+
+
+/*-*************************************
+* Hash Functions
+***************************************/
+/**
+ * Hash the d-byte value pointed to by p and mod 2^f into the frequency vector
+ */
+static size_t FASTCOVER_hashPtrToIndex(const void* p, U32 f, unsigned d) {
+  if (d == 6) {
+    return ZSTD_hash6Ptr(p, f);
+  }
+  return ZSTD_hash8Ptr(p, f);
+}
+
+
+/*-*************************************
+* Acceleration
+***************************************/
+typedef struct {
+  unsigned finalize;    /* Percentage of training samples used for ZDICT_finalizeDictionary */
+  unsigned skip;        /* Number of dmer skipped between each dmer counted in computeFrequency */
+} FASTCOVER_accel_t;
+
+
+static const FASTCOVER_accel_t FASTCOVER_defaultAccelParameters[FASTCOVER_MAX_ACCEL+1] = {
+  { 100, 0 },   /* accel = 0, should not happen because accel = 0 defaults to accel = 1 */
+  { 100, 0 },   /* accel = 1 */
+  { 50, 1 },   /* accel = 2 */
+  { 34, 2 },   /* accel = 3 */
+  { 25, 3 },   /* accel = 4 */
+  { 20, 4 },   /* accel = 5 */
+  { 17, 5 },   /* accel = 6 */
+  { 14, 6 },   /* accel = 7 */
+  { 13, 7 },   /* accel = 8 */
+  { 11, 8 },   /* accel = 9 */
+  { 10, 9 },   /* accel = 10 */
+};
+
+
+/*-*************************************
+* Context
+***************************************/
+typedef struct {
+  const BYTE *samples;
+  size_t *offsets;
+  const size_t *samplesSizes;
+  size_t nbSamples;
+  size_t nbTrainSamples;
+  size_t nbTestSamples;
+  size_t nbDmers;
+  U32 *freqs;
+  unsigned d;
+  unsigned f;
+  FASTCOVER_accel_t accelParams;
+} FASTCOVER_ctx_t;
+
+
+/*-*************************************
+*  Helper functions
+***************************************/
+/**
+ * Selects the best segment in an epoch.
+ * Segments of are scored according to the function:
+ *
+ * Let F(d) be the frequency of all dmers with hash value d.
+ * Let S_i be hash value of the dmer at position i of segment S which has length k.
+ *
+ *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
+ *
+ * Once the dmer with hash value d is in the dictionary we set F(d) = 0.
+ */
+static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
+                                              U32 *freqs, U32 begin, U32 end,
+                                              ZDICT_cover_params_t parameters,
+                                              U16* segmentFreqs) {
+  /* Constants */
+  const U32 k = parameters.k;
+  const U32 d = parameters.d;
+  const U32 f = ctx->f;
+  const U32 dmersInK = k - d + 1;
+
+  /* Try each segment (activeSegment) and save the best (bestSegment) */
+  COVER_segment_t bestSegment = {0, 0, 0};
+  COVER_segment_t activeSegment;
+
+  /* Reset the activeDmers in the segment */
+  /* The activeSegment starts at the beginning of the epoch. */
+  activeSegment.begin = begin;
+  activeSegment.end = begin;
+  activeSegment.score = 0;
+
+  /* Slide the activeSegment through the whole epoch.
+   * Save the best segment in bestSegment.
+   */
+  while (activeSegment.end < end) {
+    /* Get hash value of current dmer */
+    const size_t idx = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, f, d);
+
+    /* Add frequency of this index to score if this is the first occurrence of index in active segment */
+    if (segmentFreqs[idx] == 0) {
+      activeSegment.score += freqs[idx];
+    }
+    /* Increment end of segment and segmentFreqs*/
+    activeSegment.end += 1;
+    segmentFreqs[idx] += 1;
+    /* If the window is now too large, drop the first position */
+    if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
+      /* Get hash value of the dmer to be eliminated from active segment */
+      const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d);
+      segmentFreqs[delIndex] -= 1;
+      /* Subtract frequency of this index from score if this is the last occurrence of this index in active segment */
+      if (segmentFreqs[delIndex] == 0) {
+        activeSegment.score -= freqs[delIndex];
+      }
+      /* Increment start of segment */
+      activeSegment.begin += 1;
+    }
+
+    /* If this segment is the best so far save it */
+    if (activeSegment.score > bestSegment.score) {
+      bestSegment = activeSegment;
+    }
+  }
+
+  /* Zero out rest of segmentFreqs array */
+  while (activeSegment.begin < end) {
+    const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d);
+    segmentFreqs[delIndex] -= 1;
+    activeSegment.begin += 1;
+  }
+
+  {
+    /*  Zero the frequency of hash value of each dmer covered by the chosen segment. */
+    U32 pos;
+    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
+      const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, f, d);
+      freqs[i] = 0;
+    }
+  }
+
+  return bestSegment;
+}
+
+
+static int FASTCOVER_checkParameters(ZDICT_cover_params_t parameters,
+                                     size_t maxDictSize, unsigned f,
+                                     unsigned accel) {
+  /* k, d, and f are required parameters */
+  if (parameters.d == 0 || parameters.k == 0) {
+    return 0;
+  }
+  /* d has to be 6 or 8 */
+  if (parameters.d != 6 && parameters.d != 8) {
+    return 0;
+  }
+  /* k <= maxDictSize */
+  if (parameters.k > maxDictSize) {
+    return 0;
+  }
+  /* d <= k */
+  if (parameters.d > parameters.k) {
+    return 0;
+  }
+  /* 0 < f <= FASTCOVER_MAX_F*/
+  if (f > FASTCOVER_MAX_F || f == 0) {
+    return 0;
+  }
+  /* 0 < splitPoint <= 1 */
+  if (parameters.splitPoint <= 0 || parameters.splitPoint > 1) {
+    return 0;
+  }
+  /* 0 < accel <= 10 */
+  if (accel > 10 || accel == 0) {
+    return 0;
+  }
+  return 1;
+}
+
+
+/**
+ * Clean up a context initialized with `FASTCOVER_ctx_init()`.
+ */
+static void
+FASTCOVER_ctx_destroy(FASTCOVER_ctx_t* ctx)
+{
+    if (!ctx) return;
+
+    free(ctx->freqs);
+    ctx->freqs = NULL;
+
+    free(ctx->offsets);
+    ctx->offsets = NULL;
+}
+
+
+/**
+ * Calculate for frequency of hash value of each dmer in ctx->samples
+ */
+static void
+FASTCOVER_computeFrequency(U32* freqs, const FASTCOVER_ctx_t* ctx)
+{
+    const unsigned f = ctx->f;
+    const unsigned d = ctx->d;
+    const unsigned skip = ctx->accelParams.skip;
+    const unsigned readLength = MAX(d, 8);
+    size_t i;
+    assert(ctx->nbTrainSamples >= 5);
+    assert(ctx->nbTrainSamples <= ctx->nbSamples);
+    for (i = 0; i < ctx->nbTrainSamples; i++) {
+        size_t start = ctx->offsets[i];  /* start of current dmer */
+        size_t const currSampleEnd = ctx->offsets[i+1];
+        while (start + readLength <= currSampleEnd) {
+            const size_t dmerIndex = FASTCOVER_hashPtrToIndex(ctx->samples + start, f, d);
+            freqs[dmerIndex]++;
+            start = start + skip + 1;
+        }
+    }
+}
+
+
+/**
+ * Prepare a context for dictionary building.
+ * The context is only dependent on the parameter `d` and can be used multiple
+ * times.
+ * Returns 0 on success or error code on error.
+ * The context must be destroyed with `FASTCOVER_ctx_destroy()`.
+ */
+static size_t
+FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
+                   const void* samplesBuffer,
+                   const size_t* samplesSizes, unsigned nbSamples,
+                   unsigned d, double splitPoint, unsigned f,
+                   FASTCOVER_accel_t accelParams)
+{
+    const BYTE* const samples = (const BYTE*)samplesBuffer;
+    const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
+    /* Split samples into testing and training sets */
+    const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
+    const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
+    const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
+    const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
+
+    /* Checks */
+    if (totalSamplesSize < MAX(d, sizeof(U64)) ||
+        totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) {
+        DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
+                    (unsigned)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
+        return ERROR(srcSize_wrong);
+    }
+
+    /* Check if there are at least 5 training samples */
+    if (nbTrainSamples < 5) {
+        DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid\n", nbTrainSamples);
+        return ERROR(srcSize_wrong);
+    }
+
+    /* Check if there's testing sample */
+    if (nbTestSamples < 1) {
+        DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.\n", nbTestSamples);
+        return ERROR(srcSize_wrong);
+    }
+
+    /* Zero the context */
+    memset(ctx, 0, sizeof(*ctx));
+    DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
+                    (unsigned)trainingSamplesSize);
+    DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
+                    (unsigned)testSamplesSize);
+
+    ctx->samples = samples;
+    ctx->samplesSizes = samplesSizes;
+    ctx->nbSamples = nbSamples;
+    ctx->nbTrainSamples = nbTrainSamples;
+    ctx->nbTestSamples = nbTestSamples;
+    ctx->nbDmers = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
+    ctx->d = d;
+    ctx->f = f;
+    ctx->accelParams = accelParams;
+
+    /* The offsets of each file */
+    ctx->offsets = (size_t*)calloc((nbSamples + 1), sizeof(size_t));
+    if (ctx->offsets == NULL) {
+        DISPLAYLEVEL(1, "Failed to allocate scratch buffers \n");
+        FASTCOVER_ctx_destroy(ctx);
+        return ERROR(memory_allocation);
+    }
+
+    /* Fill offsets from the samplesSizes */
+    {   U32 i;
+        ctx->offsets[0] = 0;
+        assert(nbSamples >= 5);
+        for (i = 1; i <= nbSamples; ++i) {
+            ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
+        }
+    }
+
+    /* Initialize frequency array of size 2^f */
+    ctx->freqs = (U32*)calloc(((U64)1 << f), sizeof(U32));
+    if (ctx->freqs == NULL) {
+        DISPLAYLEVEL(1, "Failed to allocate frequency table \n");
+        FASTCOVER_ctx_destroy(ctx);
+        return ERROR(memory_allocation);
+    }
+
+    DISPLAYLEVEL(2, "Computing frequencies\n");
+    FASTCOVER_computeFrequency(ctx->freqs, ctx);
+
+    return 0;
+}
+
+
+/**
+ * Given the prepared context build the dictionary.
+ */
+static size_t
+FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
+                          U32* freqs,
+                          void* dictBuffer, size_t dictBufferCapacity,
+                          ZDICT_cover_params_t parameters,
+                          U16* segmentFreqs)
+{
+  BYTE *const dict = (BYTE *)dictBuffer;
+  size_t tail = dictBufferCapacity;
+  /* Divide the data into epochs. We will select one segment from each epoch. */
+  const COVER_epoch_info_t epochs = COVER_computeEpochs(
+      (U32)dictBufferCapacity, (U32)ctx->nbDmers, parameters.k, 1);
+  const size_t maxZeroScoreRun = 10;
+  size_t zeroScoreRun = 0;
+  size_t epoch;
+  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
+                (U32)epochs.num, (U32)epochs.size);
+  /* Loop through the epochs until there are no more segments or the dictionary
+   * is full.
+   */
+  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
+    const U32 epochBegin = (U32)(epoch * epochs.size);
+    const U32 epochEnd = epochBegin + epochs.size;
+    size_t segmentSize;
+    /* Select a segment */
+    COVER_segment_t segment = FASTCOVER_selectSegment(
+        ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs);
+
+    /* If the segment covers no dmers, then we are out of content.
+     * There may be new content in other epochs, for continue for some time.
+     */
+    if (segment.score == 0) {
+      if (++zeroScoreRun >= maxZeroScoreRun) {
+          break;
+      }
+      continue;
+    }
+    zeroScoreRun = 0;
+
+    /* Trim the segment if necessary and if it is too small then we are done */
+    segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
+    if (segmentSize < parameters.d) {
+      break;
+    }
+
+    /* We fill the dictionary from the back to allow the best segments to be
+     * referenced with the smallest offsets.
+     */
+    tail -= segmentSize;
+    memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
+    DISPLAYUPDATE(
+        2, "\r%u%%       ",
+        (unsigned)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
+  }
+  DISPLAYLEVEL(2, "\r%79s\r", "");
+  return tail;
+}
+
+/**
+ * Parameters for FASTCOVER_tryParameters().
+ */
+typedef struct FASTCOVER_tryParameters_data_s {
+    const FASTCOVER_ctx_t* ctx;
+    COVER_best_t* best;
+    size_t dictBufferCapacity;
+    ZDICT_cover_params_t parameters;
+} FASTCOVER_tryParameters_data_t;
+
+
+/**
+ * Tries a set of parameters and updates the COVER_best_t with the results.
+ * This function is thread safe if zstd is compiled with multithreaded support.
+ * It takes its parameters as an *OWNING* opaque pointer to support threading.
+ */
+static void FASTCOVER_tryParameters(void* opaque)
+{
+  /* Save parameters as local variables */
+  FASTCOVER_tryParameters_data_t *const data = (FASTCOVER_tryParameters_data_t*)opaque;
+  const FASTCOVER_ctx_t *const ctx = data->ctx;
+  const ZDICT_cover_params_t parameters = data->parameters;
+  size_t dictBufferCapacity = data->dictBufferCapacity;
+  size_t totalCompressedSize = ERROR(GENERIC);
+  /* Initialize array to keep track of frequency of dmer within activeSegment */
+  U16* segmentFreqs = (U16*)calloc(((U64)1 << ctx->f), sizeof(U16));
+  /* Allocate space for hash table, dict, and freqs */
+  BYTE *const dict = (BYTE*)malloc(dictBufferCapacity);
+  COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
+  U32* freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32));
+  if (!segmentFreqs || !dict || !freqs) {
+    DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
+    goto _cleanup;
+  }
+  /* Copy the frequencies because we need to modify them */
+  memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32));
+  /* Build the dictionary */
+  { const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity,
+                                                    parameters, segmentFreqs);
+
+    const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);
+    selection = COVER_selectDict(dict + tail, dictBufferCapacity, dictBufferCapacity - tail,
+         ctx->samples, ctx->samplesSizes, nbFinalizeSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
+         totalCompressedSize);
+
+    if (COVER_dictSelectionIsError(selection)) {
+      DISPLAYLEVEL(1, "Failed to select dictionary\n");
+      goto _cleanup;
+    }
+  }
+_cleanup:
+  free(dict);
+  COVER_best_finish(data->best, parameters, selection);
+  free(data);
+  free(segmentFreqs);
+  COVER_dictSelectionFree(selection);
+  free(freqs);
+}
+
+
+static void
+FASTCOVER_convertToCoverParams(ZDICT_fastCover_params_t fastCoverParams,
+                               ZDICT_cover_params_t* coverParams)
+{
+    coverParams->k = fastCoverParams.k;
+    coverParams->d = fastCoverParams.d;
+    coverParams->steps = fastCoverParams.steps;
+    coverParams->nbThreads = fastCoverParams.nbThreads;
+    coverParams->splitPoint = fastCoverParams.splitPoint;
+    coverParams->zParams = fastCoverParams.zParams;
+    coverParams->shrinkDict = fastCoverParams.shrinkDict;
+}
+
+
+static void
+FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams,
+                                   ZDICT_fastCover_params_t* fastCoverParams,
+                                   unsigned f, unsigned accel)
+{
+    fastCoverParams->k = coverParams.k;
+    fastCoverParams->d = coverParams.d;
+    fastCoverParams->steps = coverParams.steps;
+    fastCoverParams->nbThreads = coverParams.nbThreads;
+    fastCoverParams->splitPoint = coverParams.splitPoint;
+    fastCoverParams->f = f;
+    fastCoverParams->accel = accel;
+    fastCoverParams->zParams = coverParams.zParams;
+    fastCoverParams->shrinkDict = coverParams.shrinkDict;
+}
+
+
+ZDICTLIB_API size_t
+ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
+                                const void* samplesBuffer,
+                                const size_t* samplesSizes, unsigned nbSamples,
+                                ZDICT_fastCover_params_t parameters)
+{
+    BYTE* const dict = (BYTE*)dictBuffer;
+    FASTCOVER_ctx_t ctx;
+    ZDICT_cover_params_t coverParams;
+    FASTCOVER_accel_t accelParams;
+    /* Initialize global data */
+    g_displayLevel = (int)parameters.zParams.notificationLevel;
+    /* Assign splitPoint and f if not provided */
+    parameters.splitPoint = 1.0;
+    parameters.f = parameters.f == 0 ? DEFAULT_F : parameters.f;
+    parameters.accel = parameters.accel == 0 ? DEFAULT_ACCEL : parameters.accel;
+    /* Convert to cover parameter */
+    memset(&coverParams, 0 , sizeof(coverParams));
+    FASTCOVER_convertToCoverParams(parameters, &coverParams);
+    /* Checks */
+    if (!FASTCOVER_checkParameters(coverParams, dictBufferCapacity, parameters.f,
+                                   parameters.accel)) {
+      DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
+      return ERROR(parameter_outOfBound);
+    }
+    if (nbSamples == 0) {
+      DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
+      return ERROR(srcSize_wrong);
+    }
+    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+      DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
+                   ZDICT_DICTSIZE_MIN);
+      return ERROR(dstSize_tooSmall);
+    }
+    /* Assign corresponding FASTCOVER_accel_t to accelParams*/
+    accelParams = FASTCOVER_defaultAccelParameters[parameters.accel];
+    /* Initialize context */
+    {
+      size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
+                            coverParams.d, parameters.splitPoint, parameters.f,
+                            accelParams);
+      if (ZSTD_isError(initVal)) {
+        DISPLAYLEVEL(1, "Failed to initialize context\n");
+        return initVal;
+      }
+    }
+    COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, g_displayLevel);
+    /* Build the dictionary */
+    DISPLAYLEVEL(2, "Building dictionary\n");
+    {
+      /* Initialize array to keep track of frequency of dmer within activeSegment */
+      U16* segmentFreqs = (U16 *)calloc(((U64)1 << parameters.f), sizeof(U16));
+      const size_t tail = FASTCOVER_buildDictionary(&ctx, ctx.freqs, dictBuffer,
+                                                dictBufferCapacity, coverParams, segmentFreqs);
+      const unsigned nbFinalizeSamples = (unsigned)(ctx.nbTrainSamples * ctx.accelParams.finalize / 100);
+      const size_t dictionarySize = ZDICT_finalizeDictionary(
+          dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+          samplesBuffer, samplesSizes, nbFinalizeSamples, coverParams.zParams);
+      if (!ZSTD_isError(dictionarySize)) {
+          DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
+                      (unsigned)dictionarySize);
+      }
+      FASTCOVER_ctx_destroy(&ctx);
+      free(segmentFreqs);
+      return dictionarySize;
+    }
+}
+
+
+ZDICTLIB_API size_t
+ZDICT_optimizeTrainFromBuffer_fastCover(
+                    void* dictBuffer, size_t dictBufferCapacity,
+                    const void* samplesBuffer,
+                    const size_t* samplesSizes, unsigned nbSamples,
+                    ZDICT_fastCover_params_t* parameters)
+{
+    ZDICT_cover_params_t coverParams;
+    FASTCOVER_accel_t accelParams;
+    /* constants */
+    const unsigned nbThreads = parameters->nbThreads;
+    const double splitPoint =
+        parameters->splitPoint <= 0.0 ? FASTCOVER_DEFAULT_SPLITPOINT : parameters->splitPoint;
+    const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
+    const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
+    const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
+    const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
+    const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
+    const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
+    const unsigned kIterations =
+        (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
+    const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f;
+    const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel;
+    const unsigned shrinkDict = 0;
+    /* Local variables */
+    const int displayLevel = (int)parameters->zParams.notificationLevel;
+    unsigned iteration = 1;
+    unsigned d;
+    unsigned k;
+    COVER_best_t best;
+    POOL_ctx *pool = NULL;
+    int warned = 0;
+    /* Checks */
+    if (splitPoint <= 0 || splitPoint > 1) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
+      return ERROR(parameter_outOfBound);
+    }
+    if (accel == 0 || accel > FASTCOVER_MAX_ACCEL) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect accel\n");
+      return ERROR(parameter_outOfBound);
+    }
+    if (kMinK < kMaxD || kMaxK < kMinK) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n");
+      return ERROR(parameter_outOfBound);
+    }
+    if (nbSamples == 0) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "FASTCOVER must have at least one input file\n");
+      return ERROR(srcSize_wrong);
+    }
+    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n",
+                   ZDICT_DICTSIZE_MIN);
+      return ERROR(dstSize_tooSmall);
+    }
+    if (nbThreads > 1) {
+      pool = POOL_create(nbThreads, 1);
+      if (!pool) {
+        return ERROR(memory_allocation);
+      }
+    }
+    /* Initialization */
+    COVER_best_init(&best);
+    memset(&coverParams, 0 , sizeof(coverParams));
+    FASTCOVER_convertToCoverParams(*parameters, &coverParams);
+    accelParams = FASTCOVER_defaultAccelParameters[accel];
+    /* Turn down global display level to clean up display at level 2 and below */
+    g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
+    /* Loop through d first because each new value needs a new context */
+    LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
+                      kIterations);
+    for (d = kMinD; d <= kMaxD; d += 2) {
+      /* Initialize the context for this value of d */
+      FASTCOVER_ctx_t ctx;
+      LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
+      {
+        size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams);
+        if (ZSTD_isError(initVal)) {
+          LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
+          COVER_best_destroy(&best);
+          POOL_free(pool);
+          return initVal;
+        }
+      }
+      if (!warned) {
+        COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, displayLevel);
+        warned = 1;
+      }
+      /* Loop through k reusing the same context */
+      for (k = kMinK; k <= kMaxK; k += kStepSize) {
+        /* Prepare the arguments */
+        FASTCOVER_tryParameters_data_t *data = (FASTCOVER_tryParameters_data_t *)malloc(
+            sizeof(FASTCOVER_tryParameters_data_t));
+        LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
+        if (!data) {
+          LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
+          COVER_best_destroy(&best);
+          FASTCOVER_ctx_destroy(&ctx);
+          POOL_free(pool);
+          return ERROR(memory_allocation);
+        }
+        data->ctx = &ctx;
+        data->best = &best;
+        data->dictBufferCapacity = dictBufferCapacity;
+        data->parameters = coverParams;
+        data->parameters.k = k;
+        data->parameters.d = d;
+        data->parameters.splitPoint = splitPoint;
+        data->parameters.steps = kSteps;
+        data->parameters.shrinkDict = shrinkDict;
+        data->parameters.zParams.notificationLevel = (unsigned)g_displayLevel;
+        /* Check the parameters */
+        if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity,
+                                       data->ctx->f, accel)) {
+          DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
+          free(data);
+          continue;
+        }
+        /* Call the function and pass ownership of data to it */
+        COVER_best_start(&best);
+        if (pool) {
+          POOL_add(pool, &FASTCOVER_tryParameters, data);
+        } else {
+          FASTCOVER_tryParameters(data);
+        }
+        /* Print status */
+        LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%%       ",
+                           (unsigned)((iteration * 100) / kIterations));
+        ++iteration;
+      }
+      COVER_best_wait(&best);
+      FASTCOVER_ctx_destroy(&ctx);
+    }
+    LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
+    /* Fill the output buffer and parameters with output of the best parameters */
+    {
+      const size_t dictSize = best.dictSize;
+      if (ZSTD_isError(best.compressedSize)) {
+        const size_t compressedSize = best.compressedSize;
+        COVER_best_destroy(&best);
+        POOL_free(pool);
+        return compressedSize;
+      }
+      FASTCOVER_convertToFastCoverParams(best.parameters, parameters, f, accel);
+      memcpy(dictBuffer, best.dict, dictSize);
+      COVER_best_destroy(&best);
+      POOL_free(pool);
+      return dictSize;
+    }
+
+}
--- a/third_party/zstd/lib/dictBuilder/zdict.c
+++ b/third_party/zstd/lib/dictBuilder/zdict.c
--- a/third_party/zstd/programs/README.md
+++ b/third_party/zstd/programs/README.md
@ -0,0 +1,302 @@
+Command Line Interface for Zstandard library
+============================================
+
+Command Line Interface (CLI) can be created using the `make` command without any additional parameters.
+There are however other Makefile targets that create different variations of CLI:
+- `zstd` : default CLI supporting gzip-like arguments; includes dictionary builder, benchmark, and supports decompression of legacy zstd formats
+- `zstd_nolegacy` : Same as `zstd` but without support for legacy zstd formats
+- `zstd-small` : CLI optimized for minimal size; no dictionary builder, no benchmark, and no support for legacy zstd formats
+- `zstd-compress` : version of CLI which can only compress into zstd format
+- `zstd-decompress` : version of CLI which can only decompress zstd format
+
+
+### Compilation variables
+`zstd` scope can be altered by modifying the following `make` variables :
+
+- __HAVE_THREAD__ : multithreading is automatically enabled when `pthread` is detected.
+  It's possible to disable multithread support, by setting `HAVE_THREAD=0`.
+  Example : `make zstd HAVE_THREAD=0`
+  It's also possible to force multithread support, using `HAVE_THREAD=1`.
+  In which case, linking stage will fail if neither `pthread` nor `windows.h` library can be found.
+  This is useful to ensure this feature is not silently disabled.
+
+- __ZSTD_LEGACY_SUPPORT__ : `zstd` can decompress files compressed by older versions of `zstd`.
+  Starting v0.8.0, all versions of `zstd` produce frames compliant with the [specification](../doc/zstd_compression_format.md), and are therefore compatible.
+  But older versions (< v0.8.0) produced different, incompatible, frames.
+  By default, `zstd` supports decoding legacy formats >= v0.4.0 (`ZSTD_LEGACY_SUPPORT=4`).
+  This can be altered by modifying this compilation variable.
+  `ZSTD_LEGACY_SUPPORT=1` means "support all formats >= v0.1.0".
+  `ZSTD_LEGACY_SUPPORT=2` means "support all formats >= v0.2.0", and so on.
+  `ZSTD_LEGACY_SUPPORT=0` means _DO NOT_ support any legacy format.
+  if `ZSTD_LEGACY_SUPPORT >= 8`, it's the same as `0`, since there is no legacy format after `7`.
+  Note : `zstd` only supports decoding older formats, and cannot generate any legacy format.
+
+- __HAVE_ZLIB__ : `zstd` can compress and decompress files in `.gz` format.
+  This is ordered through command `--format=gzip`.
+  Alternatively, symlinks named `gzip` or `gunzip` will mimic intended behavior.
+  `.gz` support is automatically enabled when `zlib` library is detected at build time.
+  It's possible to disable `.gz` support, by setting `HAVE_ZLIB=0`.
+  Example : `make zstd HAVE_ZLIB=0`
+  It's also possible to force compilation with zlib support, using `HAVE_ZLIB=1`.
+  In which case, linking stage will fail if `zlib` library cannot be found.
+  This is useful to prevent silent feature disabling.
+
+- __HAVE_LZMA__ : `zstd` can compress and decompress files in `.xz` and `.lzma` formats.
+  This is ordered through commands `--format=xz` and `--format=lzma` respectively.
+  Alternatively, symlinks named `xz`, `unxz`, `lzma`, or `unlzma` will mimic intended behavior.
+  `.xz` and `.lzma` support is automatically enabled when `lzma` library is detected at build time.
+  It's possible to disable `.xz` and `.lzma` support, by setting `HAVE_LZMA=0`.
+  Example : `make zstd HAVE_LZMA=0`
+  It's also possible to force compilation with lzma support, using `HAVE_LZMA=1`.
+  In which case, linking stage will fail if `lzma` library cannot be found.
+  This is useful to prevent silent feature disabling.
+
+- __HAVE_LZ4__ : `zstd` can compress and decompress files in `.lz4` formats.
+  This is ordered through commands `--format=lz4`.
+  Alternatively, symlinks named `lz4`, or `unlz4` will mimic intended behavior.
+  `.lz4` support is automatically enabled when `lz4` library is detected at build time.
+  It's possible to disable `.lz4` support, by setting `HAVE_LZ4=0` .
+  Example : `make zstd HAVE_LZ4=0`
+  It's also possible to force compilation with lz4 support, using `HAVE_LZ4=1`.
+  In which case, linking stage will fail if `lz4` library cannot be found.
+  This is useful to prevent silent feature disabling.
+
+- __ZSTD_NOBENCH__ : `zstd` cli will be compiled without its integrated benchmark module.
+  This can be useful to produce smaller binaries.
+  In this case, the corresponding unit can also be excluded from compilation target.
+
+- __ZSTD_NODICT__ : `zstd` cli will be compiled without support for the integrated dictionary builder.
+  This can be useful to produce smaller binaries.
+  In this case, the corresponding unit can also be excluded from compilation target.
+
+- __ZSTD_NOCOMPRESS__ : `zstd` cli will be compiled without support for compression.
+  The resulting binary will only be able to decompress files.
+  This can be useful to produce smaller binaries.
+  A corresponding `Makefile` target using this ability is `zstd-decompress`.
+
+- __ZSTD_NODECOMPRESS__ : `zstd` cli will be compiled without support for decompression.
+  The resulting binary will only be able to compress files.
+  This can be useful to produce smaller binaries.
+  A corresponding `Makefile` target using this ability is `zstd-compress`.
+
+- __BACKTRACE__ : `zstd` can display a stack backtrace when execution
+  generates a runtime exception. By default, this feature may be
+  degraded/disabled on some platforms unless additional compiler directives are
+  applied. When triaging a runtime issue, enabling this feature can provide
+  more context to determine the location of the fault.
+  Example : `make zstd BACKTRACE=1`
+
+
+### Aggregation of parameters
+CLI supports aggregation of parameters i.e. `-b1`, `-e18`, and `-i1` can be joined into `-b1e18i1`.
+
+
+### Symlink shortcuts
+It's possible to invoke `zstd` through a symlink.
+When the name of the symlink has a specific value, it triggers an associated behavior.
+- `zstdmt` : compress using all cores available on local system.
+- `zcat` : will decompress and output target file using any of the supported formats. `gzcat` and `zstdcat` are also equivalent.
+- `gzip` : if zlib support is enabled, will mimic `gzip` by compressing file using `.gz` format, removing source file by default (use `--keep` to preserve). If zlib is not supported, triggers an error.
+- `xz` : if lzma support is enabled, will mimic `xz` by compressing file using `.xz` format, removing source file by default (use `--keep` to preserve). If xz is not supported, triggers an error.
+- `lzma` : if lzma support is enabled, will mimic `lzma` by compressing file using `.lzma` format, removing source file by default (use `--keep` to preserve). If lzma is not supported, triggers an error.
+- `lz4` : if lz4 support is enabled, will mimic `lz4` by compressing file using `.lz4` format. If lz4 is not supported, triggers an error.
+- `unzstd` and `unlz4` will decompress any of the supported format.
+- `ungz`, `unxz` and `unlzma` will do the same, and will also remove source file by default (use `--keep` to preserve).
+
+
+### Dictionary builder in Command Line Interface
+Zstd offers a training mode, which can be used to tune the algorithm for a selected
+type of data, by providing it with a few samples. The result of the training is stored
+in a file selected with the `-o` option (default name is `dictionary`),
+which can be loaded before compression and decompression.
+
+Using a dictionary, the compression ratio achievable on small data improves dramatically.
+These compression gains are achieved while simultaneously providing faster compression and decompression speeds.
+Dictionary work if there is some correlation in a family of small data (there is no universal dictionary).
+Hence, deploying one dictionary per type of data will provide the greater benefits.
+Dictionary gains are mostly effective in the first few KB. Then, the compression algorithm
+will rely more and more on previously decoded content to compress the rest of the file.
+
+Usage of the dictionary builder and created dictionaries with CLI:
+
+1. Create the dictionary : `zstd --train PathToTrainingSet/* -o dictionaryName`
+2. Compress with the dictionary: `zstd FILE -D dictionaryName`
+3. Decompress with the dictionary: `zstd --decompress FILE.zst -D dictionaryName`
+
+
+### Benchmark in Command Line Interface
+CLI includes in-memory compression benchmark module for zstd.
+The benchmark is conducted using given filenames. The files are read into memory and joined together.
+It makes benchmark more precise as it eliminates I/O overhead.
+Multiple filenames can be supplied, as multiple parameters, with wildcards,
+or names of directories can be used as parameters with `-r` option.
+
+The benchmark measures ratio, compressed size, compression and decompression speed.
+One can select compression levels starting from `-b` and ending with `-e`.
+The `-i` parameter selects minimal time used for each of tested levels.
+
+
+### Usage of Command Line Interface
+The full list of options can be obtained with `-h` or `-H` parameter:
+```
+Usage :
+      zstd [args] [FILE(s)] [-o file]
+
+FILE    : a filename
+          with no FILE, or when FILE is - , read standard input
+Arguments :
+ -#     : # compression level (1-19, default: 3)
+ -d     : decompression
+ -D DICT: use DICT as Dictionary for compression or decompression
+ -o file: result stored into `file` (only 1 output file)
+ -f     : overwrite output without prompting, also (de)compress links
+--rm    : remove source file(s) after successful de/compression
+ -k     : preserve source file(s) (default)
+ -h/-H  : display help/long help and exit
+
+Advanced arguments :
+ -V     : display Version number and exit
+ -c     : write to standard output (even if it is the console)
+ -v     : verbose mode; specify multiple times to increase verbosity
+ -q     : suppress warnings; specify twice to suppress errors too
+--no-progress : do not display the progress counter
+ -r     : operate recursively on directories
+--filelist FILE : read list of files to operate upon from FILE
+--output-dir-flat DIR : processed files are stored into DIR
+--output-dir-mirror DIR : processed files are stored into DIR respecting original directory structure
+--[no-]asyncio : use asynchronous IO (default: enabled)
+--[no-]check : during compression, add XXH64 integrity checksum to frame (default: enabled). If specified with -d, decompressor will ignore/validate checksums in compressed frame (default: validate).
+--      : All arguments after "--" are treated as files
+
+Advanced compression arguments :
+--ultra : enable levels beyond 19, up to 22 (requires more memory)
+--long[=#]: enable long distance matching with given window log (default: 27)
+--fast[=#]: switch to very fast compression levels (default: 1)
+--adapt : dynamically adapt compression level to I/O conditions
+--patch-from=FILE : specify the file to be used as a reference point for zstd's diff engine
+ -T#    : spawns # compression threads (default: 1, 0==# cores)
+ -B#    : select size of each job (default: 0==automatic)
+--single-thread : use a single thread for both I/O and compression (result slightly different than -T1)
+--rsyncable : compress using a rsync-friendly method (-B sets block size)
+--exclude-compressed: only compress files that are not already compressed
+--stream-size=# : specify size of streaming input from `stdin`
+--size-hint=# optimize compression parameters for streaming input of approximately this size
+--target-compressed-block-size=# : generate compressed block of approximately targeted size
+--no-dictID : don't write dictID into header (dictionary compression only)
+--[no-]compress-literals : force (un)compressed literals
+--format=zstd : compress files to the .zst format (default)
+--format=gzip : compress files to the .gz format
+--format=xz : compress files to the .xz format
+--format=lzma : compress files to the .lzma format
+--format=lz4 : compress files to the .lz4 format
+
+Advanced decompression arguments :
+ -l     : print information about zstd compressed files
+--test  : test compressed file integrity
+ -M#    : Set a memory usage limit for decompression
+--[no-]sparse : sparse mode (default: disabled)
+
+Dictionary builder :
+--train ## : create a dictionary from a training set of files
+--train-cover[=k=#,d=#,steps=#,split=#,shrink[=#]] : use the cover algorithm with optional args
+--train-fastcover[=k=#,d=#,f=#,steps=#,split=#,accel=#,shrink[=#]] : use the fast cover algorithm with optional args
+--train-legacy[=s=#] : use the legacy algorithm with selectivity (default: 9)
+ -o DICT : DICT is dictionary name (default: dictionary)
+--maxdict=# : limit dictionary to specified size (default: 112640)
+--dictID=# : force dictionary ID to specified value (default: random)
+
+Benchmark arguments :
+ -b#    : benchmark file(s), using # compression level (default: 3)
+ -e#    : test all compression levels successively from -b# to -e# (default: 1)
+ -i#    : minimum evaluation time in seconds (default: 3s)
+ -B#    : cut file into independent chunks of size # (default: no chunking)
+ -S     : output one benchmark result per input file (default: consolidated result)
+--priority=rt : set process priority to real-time
+```
+
+### Passing parameters through Environment Variables
+There is no "generic" way to pass "any kind of parameter" to `zstd` in a pass-through manner.
+Using environment variables for this purpose has security implications.
+Therefore, this avenue is intentionally restricted and only supports `ZSTD_CLEVEL` and `ZSTD_NBTHREADS`.
+
+`ZSTD_CLEVEL` can be used to modify the default compression level of `zstd`
+(usually set to `3`) to another value between 1 and 19 (the "normal" range).
+
+`ZSTD_NBTHREADS` can be used to specify a number of threads
+that `zstd` will use for compression, which by default is `1`.
+This functionality only exists when `zstd` is compiled with multithread support.
+`0` means "use as many threads as detected cpu cores on local system".
+The max # of threads is capped at `ZSTDMT_NBWORKERS_MAX`,
+which is either 64 in 32-bit mode, or 256 for 64-bit environments.
+
+This functionality can be useful when `zstd` CLI is invoked in a way that doesn't allow passing arguments.
+One such scenario is `tar --zstd`.
+As `ZSTD_CLEVEL` and `ZSTD_NBTHREADS` only replace the default compression level
+and number of threads respectively, they can both be overridden by corresponding command line arguments:
+`-#` for compression level and `-T#` for number of threads.
+
+
+### Long distance matching mode
+The long distance matching mode, enabled with `--long`, is designed to improve
+the compression ratio for files with long matches at a large distance (up to the
+maximum window size, `128 MiB`) while still maintaining compression speed.
+
+Enabling this mode sets the window size to `128 MiB` and thus increases the memory
+usage for both the compressor and decompressor. Performance in terms of speed is
+dependent on long matches being found. Compression speed may degrade if few long
+matches are found. Decompression speed usually improves when there are many long
+distance matches.
+
+Below are graphs comparing the compression speed, compression ratio, and
+decompression speed with and without long distance matching on an ideal use
+case: a tar of four versions of clang (versions `3.4.1`, `3.4.2`, `3.5.0`,
+`3.5.1`) with a total size of `244889600 B`. This is an ideal use case as there
+are many long distance matches within the maximum window size of `128 MiB` (each
+version is less than `128 MiB`).
+
+Compression Speed vs Ratio | Decompression Speed
+---------------------------|---------------------
+![Compression Speed vs Ratio](https://raw.githubusercontent.com/facebook/zstd/v1.3.3/doc/images/ldmCspeed.png "Compression Speed vs Ratio") | ![Decompression Speed](https://raw.githubusercontent.com/facebook/zstd/v1.3.3/doc/images/ldmDspeed.png "Decompression Speed")
+
+| Method | Compression ratio | Compression speed | Decompression speed  |
+|:-------|------------------:|-------------------------:|---------------------------:|
+| `zstd -1`  | `5.065`    | `284.8 MB/s`  | `759.3 MB/s`  |
+| `zstd -5`  | `5.826`    | `124.9 MB/s`  | `674.0 MB/s`  |
+| `zstd -10` | `6.504`    | `29.5 MB/s`   | `771.3 MB/s`  |
+| `zstd -1 --long` | `17.426` | `220.6 MB/s` | `1638.4 MB/s` |
+| `zstd -5 --long` | `19.661` | `165.5 MB/s` | `1530.6 MB/s` |
+| `zstd -10 --long`| `21.949` |  `75.6 MB/s` | `1632.6 MB/s` |
+
+On this file, the compression ratio improves significantly with minimal impact
+on compression speed, and the decompression speed doubles.
+
+On the other extreme, compressing a file with few long distance matches (such as
+the [Silesia compression corpus]) will likely lead to a deterioration in
+compression speed (for lower levels) with minimal change in compression ratio.
+
+The below table illustrates this on the [Silesia compression corpus].
+
+[Silesia compression corpus]: https://sun.aei.polsl.pl//~sdeor/index.php?page=silesia
+
+| Method | Compression ratio | Compression speed | Decompression speed  |
+|:-------|------------------:|------------------:|---------------------:|
+| `zstd -1`        | `2.878` | `231.7 MB/s`      | `594.4 MB/s`   |
+| `zstd -1 --long` | `2.929` | `106.5 MB/s`      | `517.9 MB/s`   |
+| `zstd -5`        | `3.274` | `77.1 MB/s`       | `464.2 MB/s`   |
+| `zstd -5 --long` | `3.319` | `51.7 MB/s`       | `371.9 MB/s`   |
+| `zstd -10`       | `3.523` | `16.4 MB/s`       | `489.2 MB/s`   |
+| `zstd -10 --long`| `3.566` | `16.2 MB/s`       | `415.7 MB/s`   |
+
+
+### zstdgrep
+
+`zstdgrep` is a utility which makes it possible to `grep` directly a `.zst` compressed file.
+It's used the same way as normal `grep`, for example :
+`zstdgrep pattern file.zst`
+
+`zstdgrep` is _not_ compatible with dictionary compression.
+
+To search into a file compressed with a dictionary,
+it's necessary to decompress it using `zstd` or `zstdcat`,
+and then pipe the result to `grep`. For example  :
+`zstdcat -D dictionary -qc -- file.zst | grep pattern`
--- a/third_party/zstd/programs/benchfn.c
+++ b/third_party/zstd/programs/benchfn.c
@ -0,0 +1,280 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+
+/* *************************************
+*  Includes
+***************************************/
+#include "libc/calls/calls.h"
+#include "libc/calls/termios.h"
+#include "libc/fmt/conv.h"
+#include "libc/limits.h"
+#include "libc/mem/alg.h"
+#include "libc/mem/alloca.h"
+#include "libc/mem/mem.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/rand.h"
+#include "libc/stdio/temp.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/exit.h"
+#include "third_party/getopt/getopt.h"
+#include "third_party/musl/crypt.h"
+#include "third_party/musl/rand48.h"      /* malloc, free */
+#include "libc/mem/alg.h"
+#include "libc/mem/mem.h"
+#include "libc/str/str.h"      /* memset */
+#include "libc/assert.h"      /* assert */
+
+#include "third_party/zstd/programs/timefn.h"        /* UTIL_time_t, UTIL_getTime */
+#include "third_party/zstd/programs/benchfn.h"
+
+
+/* *************************************
+*  Constants
+***************************************/
+#define TIMELOOP_MICROSEC     SEC_TO_MICRO      /* 1 second */
+#define TIMELOOP_NANOSEC      (1*1000000000ULL) /* 1 second */
+
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+
+/* *************************************
+*  Debug errors
+***************************************/
+#if defined(DEBUG) && (DEBUG >= 1)
+#include "libc/calls/calls.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/fmt/fmt.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/stdio.h"
+#include "libc/stdio/temp.h"
+#include "third_party/musl/tempnam.h"       /* fprintf */
+#  define DISPLAY(...)       fprintf(stderr, __VA_ARGS__)
+#  define DEBUGOUTPUT(...) { if (DEBUG) DISPLAY(__VA_ARGS__); }
+#else
+#  define DEBUGOUTPUT(...)
+#endif
+
+
+/* error without displaying */
+#define RETURN_QUIET_ERROR(retValue, ...) {           \
+    DEBUGOUTPUT("%s: %i: \n", __FILE__, __LINE__);    \
+    DEBUGOUTPUT("Error : ");                          \
+    DEBUGOUTPUT(__VA_ARGS__);                         \
+    DEBUGOUTPUT(" \n");                               \
+    return retValue;                                  \
+}
+
+/* Abort execution if a condition is not met */
+#define CONTROL(c) { if (!(c)) { DEBUGOUTPUT("error: %s \n", #c); abort(); } }
+
+
+/* *************************************
+*  Benchmarking an arbitrary function
+***************************************/
+
+int BMK_isSuccessful_runOutcome(BMK_runOutcome_t outcome)
+{
+    return outcome.error_tag_never_ever_use_directly == 0;
+}
+
+/* warning : this function will stop program execution if outcome is invalid !
+ *           check outcome validity first, using BMK_isValid_runResult() */
+BMK_runTime_t BMK_extract_runTime(BMK_runOutcome_t outcome)
+{
+    CONTROL(outcome.error_tag_never_ever_use_directly == 0);
+    return outcome.internal_never_ever_use_directly;
+}
+
+size_t BMK_extract_errorResult(BMK_runOutcome_t outcome)
+{
+    CONTROL(outcome.error_tag_never_ever_use_directly != 0);
+    return outcome.error_result_never_ever_use_directly;
+}
+
+static BMK_runOutcome_t BMK_runOutcome_error(size_t errorResult)
+{
+    BMK_runOutcome_t b;
+    memset(&b, 0, sizeof(b));
+    b.error_tag_never_ever_use_directly = 1;
+    b.error_result_never_ever_use_directly = errorResult;
+    return b;
+}
+
+static BMK_runOutcome_t BMK_setValid_runTime(BMK_runTime_t runTime)
+{
+    BMK_runOutcome_t outcome;
+    outcome.error_tag_never_ever_use_directly = 0;
+    outcome.internal_never_ever_use_directly = runTime;
+    return outcome;
+}
+
+
+/* initFn will be measured once, benchFn will be measured `nbLoops` times */
+/* initFn is optional, provide NULL if none */
+/* benchFn must return a size_t value that errorFn can interpret */
+/* takes # of blocks and list of size & stuff for each. */
+/* can report result of benchFn for each block into blockResult. */
+/* blockResult is optional, provide NULL if this information is not required */
+/* note : time per loop can be reported as zero if run time < timer resolution */
+BMK_runOutcome_t BMK_benchFunction(BMK_benchParams_t p,
+                                   unsigned nbLoops)
+{
+    size_t dstSize = 0;
+    nbLoops += !nbLoops;   /* minimum nbLoops is 1 */
+
+    /* init */
+    {   size_t i;
+        for(i = 0; i < p.blockCount; i++) {
+            memset(p.dstBuffers[i], 0xE5, p.dstCapacities[i]);  /* warm up and erase result buffer */
+    }   }
+
+    /* benchmark */
+    {   UTIL_time_t const clockStart = UTIL_getTime();
+        unsigned loopNb, blockNb;
+        if (p.initFn != NULL) p.initFn(p.initPayload);
+        for (loopNb = 0; loopNb < nbLoops; loopNb++) {
+            for (blockNb = 0; blockNb < p.blockCount; blockNb++) {
+                size_t const res = p.benchFn(p.srcBuffers[blockNb], p.srcSizes[blockNb],
+                                   p.dstBuffers[blockNb], p.dstCapacities[blockNb],
+                                   p.benchPayload);
+                if (loopNb == 0) {
+                    if (p.blockResults != NULL) p.blockResults[blockNb] = res;
+                    if ((p.errorFn != NULL) && (p.errorFn(res))) {
+                        RETURN_QUIET_ERROR(BMK_runOutcome_error(res),
+                            "Function benchmark failed on block %u (of size %u) with error %i",
+                            blockNb, (unsigned)p.srcSizes[blockNb], (int)res);
+                    }
+                    dstSize += res;
+            }   }
+        }  /* for (loopNb = 0; loopNb < nbLoops; loopNb++) */
+
+        {   PTime const totalTime = UTIL_clockSpanNano(clockStart);
+            BMK_runTime_t rt;
+            rt.nanoSecPerRun = (double)totalTime / nbLoops;
+            rt.sumOfReturn = dstSize;
+            return BMK_setValid_runTime(rt);
+    }   }
+}
+
+
+/* ====  Benchmarking any function, providing intermediate results  ==== */
+
+struct BMK_timedFnState_s {
+    PTime timeSpent_ns;
+    PTime timeBudget_ns;
+    PTime runBudget_ns;
+    BMK_runTime_t fastestRun;
+    unsigned nbLoops;
+    UTIL_time_t coolTime;
+};  /* typedef'd to BMK_timedFnState_t within bench.h */
+
+BMK_timedFnState_t* BMK_createTimedFnState(unsigned total_ms, unsigned run_ms)
+{
+    BMK_timedFnState_t* const r = (BMK_timedFnState_t*)malloc(sizeof(*r));
+    if (r == NULL) return NULL;   /* malloc() error */
+    BMK_resetTimedFnState(r, total_ms, run_ms);
+    return r;
+}
+
+void BMK_freeTimedFnState(BMK_timedFnState_t* state) { free(state); }
+
+BMK_timedFnState_t*
+BMK_initStatic_timedFnState(void* buffer, size_t size, unsigned total_ms, unsigned run_ms)
+{
+    typedef char check_size[ 2 * (sizeof(BMK_timedFnState_shell) >= sizeof(struct BMK_timedFnState_s)) - 1];  /* static assert : a compilation failure indicates that BMK_timedFnState_shell is not large enough */
+    typedef struct { check_size c; BMK_timedFnState_t tfs; } tfs_align;  /* force tfs to be aligned at its next best position */
+    size_t const tfs_alignment = offsetof(tfs_align, tfs); /* provides the minimal alignment restriction for BMK_timedFnState_t */
+    BMK_timedFnState_t* const r = (BMK_timedFnState_t*)buffer;
+    if (buffer == NULL) return NULL;
+    if (size < sizeof(struct BMK_timedFnState_s)) return NULL;
+    if ((size_t)buffer % tfs_alignment) return NULL;  /* buffer must be properly aligned */
+    BMK_resetTimedFnState(r, total_ms, run_ms);
+    return r;
+}
+
+void BMK_resetTimedFnState(BMK_timedFnState_t* timedFnState, unsigned total_ms, unsigned run_ms)
+{
+    if (!total_ms) total_ms = 1 ;
+    if (!run_ms) run_ms = 1;
+    if (run_ms > total_ms) run_ms = total_ms;
+    timedFnState->timeSpent_ns = 0;
+    timedFnState->timeBudget_ns = (PTime)total_ms * TIMELOOP_NANOSEC / 1000;
+    timedFnState->runBudget_ns = (PTime)run_ms * TIMELOOP_NANOSEC / 1000;
+    timedFnState->fastestRun.nanoSecPerRun = (double)TIMELOOP_NANOSEC * 2000000000;  /* hopefully large enough : must be larger than any potential measurement */
+    timedFnState->fastestRun.sumOfReturn = (size_t)(-1LL);
+    timedFnState->nbLoops = 1;
+    timedFnState->coolTime = UTIL_getTime();
+}
+
+/* Tells if nb of seconds set in timedFnState for all runs is spent.
+ * note : this function will return 1 if BMK_benchFunctionTimed() has actually errored. */
+int BMK_isCompleted_TimedFn(const BMK_timedFnState_t* timedFnState)
+{
+    return (timedFnState->timeSpent_ns >= timedFnState->timeBudget_ns);
+}
+
+
+#undef MIN
+#define MIN(a,b)   ( (a) < (b) ? (a) : (b) )
+
+#define MINUSABLETIME  (TIMELOOP_NANOSEC / 2)  /* 0.5 seconds */
+
+BMK_runOutcome_t BMK_benchTimedFn(BMK_timedFnState_t* cont,
+                                  BMK_benchParams_t p)
+{
+    PTime const runBudget_ns = cont->runBudget_ns;
+    PTime const runTimeMin_ns = runBudget_ns / 2;
+    int completed = 0;
+    BMK_runTime_t bestRunTime = cont->fastestRun;
+
+    while (!completed) {
+        BMK_runOutcome_t const runResult = BMK_benchFunction(p, cont->nbLoops);
+
+        if(!BMK_isSuccessful_runOutcome(runResult)) { /* error : move out */
+            return runResult;
+        }
+
+        {   BMK_runTime_t const newRunTime = BMK_extract_runTime(runResult);
+            double const loopDuration_ns = newRunTime.nanoSecPerRun * cont->nbLoops;
+
+            cont->timeSpent_ns += (unsigned long long)loopDuration_ns;
+
+            /* estimate nbLoops for next run to last approximately 1 second */
+            if (loopDuration_ns > ((double)runBudget_ns / 50)) {
+                double const fastestRun_ns = MIN(bestRunTime.nanoSecPerRun, newRunTime.nanoSecPerRun);
+                cont->nbLoops = (unsigned)((double)runBudget_ns / fastestRun_ns) + 1;
+            } else {
+                /* previous run was too short : blindly increase workload by x multiplier */
+                const unsigned multiplier = 10;
+                assert(cont->nbLoops < ((unsigned)-1) / multiplier);  /* avoid overflow */
+                cont->nbLoops *= multiplier;
+            }
+
+            if(loopDuration_ns < (double)runTimeMin_ns) {
+                /* don't report results for which benchmark run time was too small : increased risks of rounding errors */
+                assert(completed == 0);
+                continue;
+            } else {
+                if(newRunTime.nanoSecPerRun < bestRunTime.nanoSecPerRun) {
+                    bestRunTime = newRunTime;
+                }
+                completed = 1;
+            }
+        }
+    }   /* while (!completed) */
+
+    return BMK_setValid_runTime(bestRunTime);
+}
--- a/third_party/zstd/programs/benchfn.h
+++ b/third_party/zstd/programs/benchfn.h
@ -0,0 +1,184 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* benchfn :
+ * benchmark any function on a set of input
+ * providing result in nanoSecPerRun
+ * or detecting and returning an error
+ */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef BENCH_FN_H_23876
+#define BENCH_FN_H_23876
+
+/* ===  Dependencies  === */
+   /* size_t */
+
+
+/* ====  Benchmark any function, iterated on a set of blocks  ==== */
+
+/* BMK_runTime_t: valid result return type */
+
+typedef struct {
+    double nanoSecPerRun;  /* time per iteration (over all blocks) */
+    size_t sumOfReturn;         /* sum of return values */
+} BMK_runTime_t;
+
+
+/* BMK_runOutcome_t:
+ * type expressing the outcome of a benchmark run by BMK_benchFunction(),
+ * which can be either valid or invalid.
+ * benchmark outcome can be invalid if errorFn is provided.
+ * BMK_runOutcome_t must be considered "opaque" : never access its members directly.
+ * Instead, use its assigned methods :
+ * BMK_isSuccessful_runOutcome, BMK_extract_runTime, BMK_extract_errorResult.
+ * The structure is only described here to allow its allocation on stack. */
+
+typedef struct {
+    BMK_runTime_t internal_never_ever_use_directly;
+    size_t error_result_never_ever_use_directly;
+    int error_tag_never_ever_use_directly;
+} BMK_runOutcome_t;
+
+
+/* prototypes for benchmarked functions */
+typedef size_t (*BMK_benchFn_t)(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* customPayload);
+typedef size_t (*BMK_initFn_t)(void* initPayload);
+typedef unsigned (*BMK_errorFn_t)(size_t);
+
+
+/* BMK_benchFunction() parameters are provided via the following structure.
+ * A structure is preferable for readability,
+ * as the number of parameters required is fairly large.
+ * No initializer is provided, because it doesn't make sense to provide some "default" :
+ * all parameters must be specified by the caller.
+ * optional parameters are labelled explicitly, and accept value NULL when not used */
+typedef struct {
+    BMK_benchFn_t benchFn;    /* the function to benchmark, over the set of blocks */
+    void* benchPayload;       /* pass custom parameters to benchFn  :
+                               * (*benchFn)(srcBuffers[i], srcSizes[i], dstBuffers[i], dstCapacities[i], benchPayload) */
+    BMK_initFn_t initFn;      /* (*initFn)(initPayload) is run once per run, at the beginning. */
+    void* initPayload;        /* Both arguments can be NULL, in which case nothing is run. */
+    BMK_errorFn_t errorFn;    /* errorFn will check each return value of benchFn over each block, to determine if it failed or not.
+                               * errorFn can be NULL, in which case no check is performed.
+                               * errorFn must return 0 when benchFn was successful, and >= 1 if it detects an error.
+                               * Execution is stopped as soon as an error is detected.
+                               * the triggering return value can be retrieved using BMK_extract_errorResult(). */
+    size_t blockCount;        /* number of blocks to operate benchFn on.
+                               * It's also the size of all array parameters :
+                               * srcBuffers, srcSizes, dstBuffers, dstCapacities, blockResults */
+    const void *const * srcBuffers; /* read-only array of buffers to be operated on by benchFn */
+    const size_t* srcSizes;   /* read-only array containing sizes of srcBuffers */
+    void *const * dstBuffers; /* array of buffers to be written into by benchFn. This array is not optional, it must be provided even if unused by benchfn. */
+    const size_t* dstCapacities; /* read-only array containing capacities of dstBuffers. This array must be present. */
+    size_t* blockResults;     /* Optional: store the return value of benchFn for each block. Use NULL if this result is not requested. */
+} BMK_benchParams_t;
+
+
+/* BMK_benchFunction() :
+ * This function benchmarks benchFn and initFn, providing a result.
+ *
+ * params : see description of BMK_benchParams_t above.
+ * nbLoops: defines number of times benchFn is run over the full set of blocks.
+ *          Minimum value is 1. A 0 is interpreted as a 1.
+ *
+ * @return: can express either an error or a successful result.
+ *          Use BMK_isSuccessful_runOutcome() to check if benchmark was successful.
+ *          If yes, extract the result with BMK_extract_runTime(),
+ *          it will contain :
+ *              .sumOfReturn : the sum of all return values of benchFn through all of blocks
+ *              .nanoSecPerRun : time per run of benchFn + (time for initFn / nbLoops)
+ *          .sumOfReturn is generally intended for functions which return a # of bytes written into dstBuffer,
+ *              in which case, this value will be the total amount of bytes written into dstBuffer.
+ *
+ * blockResults : when provided (!= NULL), and when benchmark is successful,
+ *                params.blockResults contains all return values of `benchFn` over all blocks.
+ *                when provided (!= NULL), and when benchmark failed,
+ *                params.blockResults contains return values of `benchFn` over all blocks preceding and including the failed block.
+ */
+BMK_runOutcome_t BMK_benchFunction(BMK_benchParams_t params, unsigned nbLoops);
+
+
+
+/* check first if the benchmark was successful or not */
+int BMK_isSuccessful_runOutcome(BMK_runOutcome_t outcome);
+
+/* If the benchmark was successful, extract the result.
+ * note : this function will abort() program execution if benchmark failed !
+ *        always check if benchmark was successful first !
+ */
+BMK_runTime_t BMK_extract_runTime(BMK_runOutcome_t outcome);
+
+/* when benchmark failed, it means one invocation of `benchFn` failed.
+ * The failure was detected by `errorFn`, operating on return values of `benchFn`.
+ * Returns the faulty return value.
+ * note : this function will abort() program execution if benchmark did not fail.
+ *        always check if benchmark failed first !
+ */
+size_t BMK_extract_errorResult(BMK_runOutcome_t outcome);
+
+
+
+/* ====  Benchmark any function, returning intermediate results  ==== */
+
+/* state information tracking benchmark session */
+typedef struct BMK_timedFnState_s BMK_timedFnState_t;
+
+/* BMK_benchTimedFn() :
+ * Similar to BMK_benchFunction(), most arguments being identical.
+ * Automatically determines `nbLoops` so that each result is regularly produced at interval of about run_ms.
+ * Note : minimum `nbLoops` is 1, therefore a run may last more than run_ms, and possibly even more than total_ms.
+ * Usage - initialize timedFnState, select benchmark duration (total_ms) and each measurement duration (run_ms)
+ *         call BMK_benchTimedFn() repetitively, each measurement is supposed to last about run_ms
+ *         Check if total time budget is spent or exceeded, using BMK_isCompleted_TimedFn()
+ */
+BMK_runOutcome_t BMK_benchTimedFn(BMK_timedFnState_t* timedFnState,
+                                  BMK_benchParams_t params);
+
+/* Tells if duration of all benchmark runs has exceeded total_ms
+ */
+int BMK_isCompleted_TimedFn(const BMK_timedFnState_t* timedFnState);
+
+/* BMK_createTimedFnState() and BMK_resetTimedFnState() :
+ * Create/Set BMK_timedFnState_t for next benchmark session,
+ * which shall last a minimum of total_ms milliseconds,
+ * producing intermediate results, paced at interval of (approximately) run_ms.
+ */
+BMK_timedFnState_t* BMK_createTimedFnState(unsigned total_ms, unsigned run_ms);
+void BMK_resetTimedFnState(BMK_timedFnState_t* timedFnState, unsigned total_ms, unsigned run_ms);
+void BMK_freeTimedFnState(BMK_timedFnState_t* state);
+
+
+/* BMK_timedFnState_shell and BMK_initStatic_timedFnState() :
+ * Makes it possible to statically allocate a BMK_timedFnState_t on stack.
+ * BMK_timedFnState_shell is only there to allocate space,
+ * never ever access its members.
+ * BMK_timedFnState_t() actually accepts any buffer.
+ * It will check if provided buffer is large enough and is correctly aligned,
+ * and will return NULL if conditions are not respected.
+ */
+#define BMK_TIMEDFNSTATE_SIZE 64
+typedef union {
+    char never_access_space[BMK_TIMEDFNSTATE_SIZE];
+    long long alignment_enforcer;  /* must be aligned on 8-bytes boundaries */
+} BMK_timedFnState_shell;
+BMK_timedFnState_t* BMK_initStatic_timedFnState(void* buffer, size_t size, unsigned total_ms, unsigned run_ms);
+
+
+#endif   /* BENCH_FN_H_23876 */
+
+#if defined (__cplusplus)
+}
+#endif
--- a/third_party/zstd/programs/benchzstd.c
+++ b/third_party/zstd/programs/benchzstd.c
@ -0,0 +1,938 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* **************************************
+*  Tuning parameters
+****************************************/
+#ifndef BMK_TIMETEST_DEFAULT_S   /* default minimum time per test */
+# define BMK_TIMETEST_DEFAULT_S 3
+#endif
+
+
+/* *************************************
+*  Includes
+***************************************/
+#include "third_party/zstd/programs/platform.h"    /* Large Files support */
+#include "third_party/zstd/programs/util.h"        /* UTIL_getFileSize, UTIL_sleep */
+#include "libc/calls/calls.h"
+#include "libc/calls/termios.h"
+#include "libc/fmt/conv.h"
+#include "libc/limits.h"
+#include "libc/mem/alg.h"
+#include "libc/mem/alloca.h"
+#include "libc/mem/mem.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/rand.h"
+#include "libc/stdio/temp.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/exit.h"
+#include "third_party/getopt/getopt.h"
+#include "third_party/musl/crypt.h"
+#include "third_party/musl/rand48.h"      /* malloc, free */
+#include "libc/mem/alg.h"
+#include "libc/mem/mem.h"
+#include "libc/str/str.h"      /* memset, strerror */
+#include "libc/calls/calls.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/fmt/fmt.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/stdio.h"
+#include "libc/stdio/temp.h"
+#include "third_party/musl/tempnam.h"       /* fprintf, fopen */
+#include "libc/errno.h"
+#include "libc/assert.h"      /* assert */
+
+#include "third_party/zstd/programs/timefn.h"      /* UTIL_time_t */
+#include "third_party/zstd/programs/benchfn.h"
+#include "third_party/zstd/lib/common/mem.h"
+#ifndef ZSTD_STATIC_LINKING_ONLY
+#define ZSTD_STATIC_LINKING_ONLY
+#endif
+#include "third_party/zstd/zstd.h"
+#include "third_party/zstd/programs/datagen.h"     /* RDG_genBuffer */
+#ifndef XXH_INLINE_ALL
+#define XXH_INLINE_ALL
+#endif
+#include "third_party/zstd/lib/common/xxhash.h"
+#include "third_party/zstd/programs/benchzstd.h"
+#include "third_party/zstd/zstd_errors.h"
+
+
+/* *************************************
+*  Constants
+***************************************/
+#ifndef ZSTD_GIT_COMMIT
+#  define ZSTD_GIT_COMMIT_STRING ""
+#else
+#  define ZSTD_GIT_COMMIT_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_GIT_COMMIT)
+#endif
+
+#define TIMELOOP_MICROSEC     (1*1000000ULL) /* 1 second */
+#define TIMELOOP_NANOSEC      (1*1000000000ULL) /* 1 second */
+#define ACTIVEPERIOD_MICROSEC (70*TIMELOOP_MICROSEC) /* 70 seconds */
+#define COOLPERIOD_SEC        10
+
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define BMK_RUNTEST_DEFAULT_MS 1000
+
+static const size_t maxMemory = (sizeof(size_t)==4)  ?
+                    /* 32-bit */ (2 GB - 64 MB) :
+                    /* 64-bit */ (size_t)(1ULL << ((sizeof(size_t)*8)-31));
+
+
+/* *************************************
+*  console display
+***************************************/
+#define DISPLAY(...)         { fprintf(stderr, __VA_ARGS__); fflush(NULL); }
+#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
+/* 0 : no display;   1: errors;   2 : + result + interaction + warnings;   3 : + progression;   4 : + information */
+#define OUTPUT(...)          { fprintf(stdout, __VA_ARGS__); fflush(NULL); }
+#define OUTPUTLEVEL(l, ...)  if (displayLevel>=l) { OUTPUT(__VA_ARGS__); }
+
+
+/* *************************************
+*  Exceptions
+***************************************/
+#ifndef DEBUG
+#  define DEBUG 0
+#endif
+#define DEBUGOUTPUT(...) { if (DEBUG) DISPLAY(__VA_ARGS__); }
+
+#define RETURN_ERROR_INT(errorNum, ...)  {               \
+    DEBUGOUTPUT("%s: %i: \n", __FILE__, __LINE__);    \
+    DISPLAYLEVEL(1, "Error %i : ", errorNum);         \
+    DISPLAYLEVEL(1, __VA_ARGS__);                     \
+    DISPLAYLEVEL(1, " \n");                           \
+    return errorNum;                                  \
+}
+
+#define CHECK_Z(zf) {              \
+    size_t const zerr = zf;        \
+    if (ZSTD_isError(zerr)) {      \
+        DEBUGOUTPUT("%s: %i: \n", __FILE__, __LINE__);  \
+        DISPLAY("Error : ");       \
+        DISPLAY("%s failed : %s",  \
+                #zf, ZSTD_getErrorName(zerr));   \
+        DISPLAY(" \n");            \
+        exit(1);                   \
+    }                              \
+}
+
+#define RETURN_ERROR(errorNum, retType, ...)  {       \
+    retType r;                                        \
+    memset(&r, 0, sizeof(retType));                   \
+    DEBUGOUTPUT("%s: %i: \n", __FILE__, __LINE__);    \
+    DISPLAYLEVEL(1, "Error %i : ", errorNum);         \
+    DISPLAYLEVEL(1, __VA_ARGS__);                     \
+    DISPLAYLEVEL(1, " \n");                           \
+    r.tag = errorNum;                                 \
+    return r;                                         \
+}
+
+
+/* *************************************
+*  Benchmark Parameters
+***************************************/
+
+BMK_advancedParams_t BMK_initAdvancedParams(void) {
+    BMK_advancedParams_t const res = {
+        BMK_both, /* mode */
+        BMK_TIMETEST_DEFAULT_S, /* nbSeconds */
+        0, /* blockSize */
+        0, /* nbWorkers */
+        0, /* realTime */
+        0, /* additionalParam */
+        0, /* ldmFlag */
+        0, /* ldmMinMatch */
+        0, /* ldmHashLog */
+        0, /* ldmBuckSizeLog */
+        0,  /* ldmHashRateLog */
+        ZSTD_ps_auto, /* literalCompressionMode */
+        0 /* useRowMatchFinder */
+    };
+    return res;
+}
+
+
+/* ********************************************************
+*  Bench functions
+**********************************************************/
+typedef struct {
+    const void* srcPtr;
+    size_t srcSize;
+    void*  cPtr;
+    size_t cRoom;
+    size_t cSize;
+    void*  resPtr;
+    size_t resSize;
+} blockParam_t;
+
+#undef MIN
+#undef MAX
+#define MIN(a,b)    ((a) < (b) ? (a) : (b))
+#define MAX(a,b)    ((a) > (b) ? (a) : (b))
+
+static void
+BMK_initCCtx(ZSTD_CCtx* ctx,
+            const void* dictBuffer, size_t dictBufferSize,
+            int cLevel,
+            const ZSTD_compressionParameters* comprParams,
+            const BMK_advancedParams_t* adv)
+{
+    ZSTD_CCtx_reset(ctx, ZSTD_reset_session_and_parameters);
+    if (adv->nbWorkers==1) {
+        CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_nbWorkers, 0));
+    } else {
+        CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_nbWorkers, adv->nbWorkers));
+    }
+    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_compressionLevel, cLevel));
+    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_useRowMatchFinder, adv->useRowMatchFinder));
+    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_enableLongDistanceMatching, adv->ldmFlag));
+    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_ldmMinMatch, adv->ldmMinMatch));
+    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_ldmHashLog, adv->ldmHashLog));
+    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_ldmBucketSizeLog, adv->ldmBucketSizeLog));
+    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_ldmHashRateLog, adv->ldmHashRateLog));
+    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_windowLog, (int)comprParams->windowLog));
+    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_hashLog, (int)comprParams->hashLog));
+    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_chainLog, (int)comprParams->chainLog));
+    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_searchLog, (int)comprParams->searchLog));
+    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_minMatch, (int)comprParams->minMatch));
+    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_targetLength, (int)comprParams->targetLength));
+    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_literalCompressionMode, (int)adv->literalCompressionMode));
+    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_strategy, (int)comprParams->strategy));
+    CHECK_Z(ZSTD_CCtx_loadDictionary(ctx, dictBuffer, dictBufferSize));
+}
+
+static void BMK_initDCtx(ZSTD_DCtx* dctx,
+    const void* dictBuffer, size_t dictBufferSize) {
+    CHECK_Z(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_and_parameters));
+    CHECK_Z(ZSTD_DCtx_loadDictionary(dctx, dictBuffer, dictBufferSize));
+}
+
+
+typedef struct {
+    ZSTD_CCtx* cctx;
+    const void* dictBuffer;
+    size_t dictBufferSize;
+    int cLevel;
+    const ZSTD_compressionParameters* comprParams;
+    const BMK_advancedParams_t* adv;
+} BMK_initCCtxArgs;
+
+static size_t local_initCCtx(void* payload) {
+    BMK_initCCtxArgs* ag = (BMK_initCCtxArgs*)payload;
+    BMK_initCCtx(ag->cctx, ag->dictBuffer, ag->dictBufferSize, ag->cLevel, ag->comprParams, ag->adv);
+    return 0;
+}
+
+typedef struct {
+    ZSTD_DCtx* dctx;
+    const void* dictBuffer;
+    size_t dictBufferSize;
+} BMK_initDCtxArgs;
+
+static size_t local_initDCtx(void* payload) {
+    BMK_initDCtxArgs* ag = (BMK_initDCtxArgs*)payload;
+    BMK_initDCtx(ag->dctx, ag->dictBuffer, ag->dictBufferSize);
+    return 0;
+}
+
+
+/* `addArgs` is the context */
+static size_t local_defaultCompress(
+                    const void* srcBuffer, size_t srcSize,
+                    void* dstBuffer, size_t dstSize,
+                    void* addArgs)
+{
+    ZSTD_CCtx* const cctx = (ZSTD_CCtx*)addArgs;
+    return ZSTD_compress2(cctx, dstBuffer, dstSize, srcBuffer, srcSize);
+}
+
+/* `addArgs` is the context */
+static size_t local_defaultDecompress(
+                    const void* srcBuffer, size_t srcSize,
+                    void* dstBuffer, size_t dstCapacity,
+                    void* addArgs)
+{
+    size_t moreToFlush = 1;
+    ZSTD_DCtx* const dctx = (ZSTD_DCtx*)addArgs;
+    ZSTD_inBuffer in;
+    ZSTD_outBuffer out;
+    in.src = srcBuffer; in.size = srcSize; in.pos = 0;
+    out.dst = dstBuffer; out.size = dstCapacity; out.pos = 0;
+    while (moreToFlush) {
+        if(out.pos == out.size) {
+            return (size_t)-ZSTD_error_dstSize_tooSmall;
+        }
+        moreToFlush = ZSTD_decompressStream(dctx, &out, &in);
+        if (ZSTD_isError(moreToFlush)) {
+            return moreToFlush;
+        }
+    }
+    return out.pos;
+
+}
+
+
+/* ================================================================= */
+/*      Benchmark Zstandard, mem-to-mem scenarios                    */
+/* ================================================================= */
+
+int BMK_isSuccessful_benchOutcome(BMK_benchOutcome_t outcome)
+{
+    return outcome.tag == 0;
+}
+
+BMK_benchResult_t BMK_extract_benchResult(BMK_benchOutcome_t outcome)
+{
+    assert(outcome.tag == 0);
+    return outcome.internal_never_use_directly;
+}
+
+static BMK_benchOutcome_t BMK_benchOutcome_error(void)
+{
+    BMK_benchOutcome_t b;
+    memset(&b, 0, sizeof(b));
+    b.tag = 1;
+    return b;
+}
+
+static BMK_benchOutcome_t BMK_benchOutcome_setValidResult(BMK_benchResult_t result)
+{
+    BMK_benchOutcome_t b;
+    b.tag = 0;
+    b.internal_never_use_directly = result;
+    return b;
+}
+
+
+/* benchMem with no allocation */
+static BMK_benchOutcome_t
+BMK_benchMemAdvancedNoAlloc(
+                    const void** srcPtrs, size_t* srcSizes,
+                    void** cPtrs, size_t* cCapacities, size_t* cSizes,
+                    void** resPtrs, size_t* resSizes,
+                    void** resultBufferPtr, void* compressedBuffer,
+                    size_t maxCompressedSize,
+                    BMK_timedFnState_t* timeStateCompress,
+                    BMK_timedFnState_t* timeStateDecompress,
+
+                    const void* srcBuffer, size_t srcSize,
+                    const size_t* fileSizes, unsigned nbFiles,
+                    const int cLevel,
+                    const ZSTD_compressionParameters* comprParams,
+                    const void* dictBuffer, size_t dictBufferSize,
+                    ZSTD_CCtx* cctx, ZSTD_DCtx* dctx,
+                    int displayLevel, const char* displayName,
+                    const BMK_advancedParams_t* adv)
+{
+    size_t const blockSize = ((adv->blockSize>=32 && (adv->mode != BMK_decodeOnly)) ? adv->blockSize : srcSize) + (!srcSize);  /* avoid div by 0 */
+    BMK_benchResult_t benchResult;
+    size_t const loadedCompressedSize = srcSize;
+    size_t cSize = 0;
+    double ratio = 0.;
+    U32 nbBlocks;
+
+    assert(cctx != NULL); assert(dctx != NULL);
+
+    /* init */
+    memset(&benchResult, 0, sizeof(benchResult));
+    if (strlen(displayName)>17) displayName += strlen(displayName) - 17;   /* display last 17 characters */
+    if (adv->mode == BMK_decodeOnly) {
+        /* benchmark only decompression : source must be already compressed */
+        const char* srcPtr = (const char*)srcBuffer;
+        U64 totalDSize64 = 0;
+        U32 fileNb;
+        for (fileNb=0; fileNb<nbFiles; fileNb++) {
+            U64 const fSize64 = ZSTD_findDecompressedSize(srcPtr, fileSizes[fileNb]);
+            if (fSize64 == ZSTD_CONTENTSIZE_UNKNOWN) {
+                RETURN_ERROR(32, BMK_benchOutcome_t, "Decompressed size cannot be determined: cannot benchmark");
+            }
+            if (fSize64 == ZSTD_CONTENTSIZE_ERROR) {
+                RETURN_ERROR(32, BMK_benchOutcome_t, "Error while trying to assess decompressed size: data may be invalid");
+            }
+            totalDSize64 += fSize64;
+            srcPtr += fileSizes[fileNb];
+        }
+        {   size_t const decodedSize = (size_t)totalDSize64;
+            assert((U64)decodedSize == totalDSize64);   /* check overflow */
+            free(*resultBufferPtr);
+            if (totalDSize64 > decodedSize) {  /* size_t overflow */
+                RETURN_ERROR(32, BMK_benchOutcome_t, "decompressed size is too large for local system");
+            }
+            *resultBufferPtr = malloc(decodedSize);
+            if (!(*resultBufferPtr)) {
+                RETURN_ERROR(33, BMK_benchOutcome_t, "allocation error: not enough memory");
+            }
+            cSize = srcSize;
+            srcSize = decodedSize;
+            ratio = (double)srcSize / (double)cSize;
+        }
+    }
+
+    /* Init data blocks  */
+    {   const char* srcPtr = (const char*)srcBuffer;
+        char* cPtr = (char*)compressedBuffer;
+        char* resPtr = (char*)(*resultBufferPtr);
+        U32 fileNb;
+        for (nbBlocks=0, fileNb=0; fileNb<nbFiles; fileNb++) {
+            size_t remaining = fileSizes[fileNb];
+            U32 const nbBlocksforThisFile = (adv->mode == BMK_decodeOnly) ? 1 : (U32)((remaining + (blockSize-1)) / blockSize);
+            U32 const blockEnd = nbBlocks + nbBlocksforThisFile;
+            for ( ; nbBlocks<blockEnd; nbBlocks++) {
+                size_t const thisBlockSize = MIN(remaining, blockSize);
+                srcPtrs[nbBlocks] = srcPtr;
+                srcSizes[nbBlocks] = thisBlockSize;
+                cPtrs[nbBlocks] = cPtr;
+                cCapacities[nbBlocks] = (adv->mode == BMK_decodeOnly) ? thisBlockSize : ZSTD_compressBound(thisBlockSize);
+                resPtrs[nbBlocks] = resPtr;
+                resSizes[nbBlocks] = (adv->mode == BMK_decodeOnly) ? (size_t) ZSTD_findDecompressedSize(srcPtr, thisBlockSize) : thisBlockSize;
+                srcPtr += thisBlockSize;
+                cPtr += cCapacities[nbBlocks];
+                resPtr += thisBlockSize;
+                remaining -= thisBlockSize;
+                if (adv->mode == BMK_decodeOnly) {
+                    cSizes[nbBlocks] = thisBlockSize;
+                    benchResult.cSize = thisBlockSize;
+    }   }   }   }
+
+    /* warming up `compressedBuffer` */
+    if (adv->mode == BMK_decodeOnly) {
+        memcpy(compressedBuffer, srcBuffer, loadedCompressedSize);
+    } else {
+        RDG_genBuffer(compressedBuffer, maxCompressedSize, 0.10, 0.50, 1);
+    }
+
+    if (!UTIL_support_MT_measurements() && adv->nbWorkers > 1) {
+        OUTPUTLEVEL(2, "Warning : time measurements may be incorrect in multithreading mode... \n")
+    }
+
+    /* Bench */
+    {   U64 const crcOrig = (adv->mode == BMK_decodeOnly) ? 0 : XXH64(srcBuffer, srcSize, 0);
+#       define NB_MARKS 4
+        const char* marks[NB_MARKS] = { " |", " /", " =", " \\" };
+        U32 markNb = 0;
+        int compressionCompleted = (adv->mode == BMK_decodeOnly);
+        int decompressionCompleted = (adv->mode == BMK_compressOnly);
+        BMK_benchParams_t cbp, dbp;
+        BMK_initCCtxArgs cctxprep;
+        BMK_initDCtxArgs dctxprep;
+
+        cbp.benchFn = local_defaultCompress;   /* ZSTD_compress2 */
+        cbp.benchPayload = cctx;
+        cbp.initFn = local_initCCtx;   /* BMK_initCCtx */
+        cbp.initPayload = &cctxprep;
+        cbp.errorFn = ZSTD_isError;
+        cbp.blockCount = nbBlocks;
+        cbp.srcBuffers = srcPtrs;
+        cbp.srcSizes = srcSizes;
+        cbp.dstBuffers = cPtrs;
+        cbp.dstCapacities = cCapacities;
+        cbp.blockResults = cSizes;
+
+        cctxprep.cctx = cctx;
+        cctxprep.dictBuffer = dictBuffer;
+        cctxprep.dictBufferSize = dictBufferSize;
+        cctxprep.cLevel = cLevel;
+        cctxprep.comprParams = comprParams;
+        cctxprep.adv = adv;
+
+        dbp.benchFn = local_defaultDecompress;
+        dbp.benchPayload = dctx;
+        dbp.initFn = local_initDCtx;
+        dbp.initPayload = &dctxprep;
+        dbp.errorFn = ZSTD_isError;
+        dbp.blockCount = nbBlocks;
+        dbp.srcBuffers = (const void* const *) cPtrs;
+        dbp.srcSizes = cSizes;
+        dbp.dstBuffers = resPtrs;
+        dbp.dstCapacities = resSizes;
+        dbp.blockResults = NULL;
+
+        dctxprep.dctx = dctx;
+        dctxprep.dictBuffer = dictBuffer;
+        dctxprep.dictBufferSize = dictBufferSize;
+
+        OUTPUTLEVEL(2, "\r%70s\r", "");   /* blank line */
+        assert(srcSize < UINT_MAX);
+        OUTPUTLEVEL(2, "%2s-%-17.17s :%10u -> \r", marks[markNb], displayName, (unsigned)srcSize);
+
+        while (!(compressionCompleted && decompressionCompleted)) {
+            if (!compressionCompleted) {
+                BMK_runOutcome_t const cOutcome = BMK_benchTimedFn( timeStateCompress, cbp);
+
+                if (!BMK_isSuccessful_runOutcome(cOutcome)) {
+                    RETURN_ERROR(30, BMK_benchOutcome_t, "compression error");
+                }
+
+                {   BMK_runTime_t const cResult = BMK_extract_runTime(cOutcome);
+                    cSize = cResult.sumOfReturn;
+                    ratio = (double)srcSize / (double)cSize;
+                    {   BMK_benchResult_t newResult;
+                        newResult.cSpeed = (U64)((double)srcSize * TIMELOOP_NANOSEC / cResult.nanoSecPerRun);
+                        benchResult.cSize = cSize;
+                        if (newResult.cSpeed > benchResult.cSpeed)
+                            benchResult.cSpeed = newResult.cSpeed;
+                }   }
+
+                {   int const ratioAccuracy = (ratio < 10.) ? 3 : 2;
+                    assert(cSize < UINT_MAX);
+                    OUTPUTLEVEL(2, "%2s-%-17.17s :%10u ->%10u (x%5.*f), %6.*f MB/s \r",
+                            marks[markNb], displayName,
+                            (unsigned)srcSize, (unsigned)cSize,
+                            ratioAccuracy, ratio,
+                            benchResult.cSpeed < (10 * MB_UNIT) ? 2 : 1, (double)benchResult.cSpeed / MB_UNIT);
+                }
+                compressionCompleted = BMK_isCompleted_TimedFn(timeStateCompress);
+            }
+
+            if(!decompressionCompleted) {
+                BMK_runOutcome_t const dOutcome = BMK_benchTimedFn(timeStateDecompress, dbp);
+
+                if(!BMK_isSuccessful_runOutcome(dOutcome)) {
+                    RETURN_ERROR(30, BMK_benchOutcome_t, "decompression error");
+                }
+
+                {   BMK_runTime_t const dResult = BMK_extract_runTime(dOutcome);
+                    U64 const newDSpeed = (U64)((double)srcSize * TIMELOOP_NANOSEC / dResult.nanoSecPerRun);
+                    if (newDSpeed > benchResult.dSpeed)
+                        benchResult.dSpeed = newDSpeed;
+                }
+
+                {   int const ratioAccuracy = (ratio < 10.) ? 3 : 2;
+                    OUTPUTLEVEL(2, "%2s-%-17.17s :%10u ->%10u (x%5.*f), %6.*f MB/s, %6.1f MB/s\r",
+                            marks[markNb], displayName,
+                            (unsigned)srcSize, (unsigned)cSize,
+                            ratioAccuracy, ratio,
+                            benchResult.cSpeed < (10 * MB_UNIT) ? 2 : 1, (double)benchResult.cSpeed / MB_UNIT,
+                            (double)benchResult.dSpeed / MB_UNIT);
+                }
+                decompressionCompleted = BMK_isCompleted_TimedFn(timeStateDecompress);
+            }
+            markNb = (markNb+1) % NB_MARKS;
+        }   /* while (!(compressionCompleted && decompressionCompleted)) */
+
+        /* CRC Checking */
+        {   const BYTE* resultBuffer = (const BYTE*)(*resultBufferPtr);
+            U64 const crcCheck = XXH64(resultBuffer, srcSize, 0);
+            if ((adv->mode == BMK_both) && (crcOrig!=crcCheck)) {
+                size_t u;
+                DISPLAY("!!! WARNING !!! %14s : Invalid Checksum : %x != %x   \n",
+                        displayName, (unsigned)crcOrig, (unsigned)crcCheck);
+                for (u=0; u<srcSize; u++) {
+                    if (((const BYTE*)srcBuffer)[u] != resultBuffer[u]) {
+                        unsigned segNb, bNb, pos;
+                        size_t bacc = 0;
+                        DISPLAY("Decoding error at pos %u ", (unsigned)u);
+                        for (segNb = 0; segNb < nbBlocks; segNb++) {
+                            if (bacc + srcSizes[segNb] > u) break;
+                            bacc += srcSizes[segNb];
+                        }
+                        pos = (U32)(u - bacc);
+                        bNb = pos / (128 KB);
+                        DISPLAY("(sample %u, block %u, pos %u) \n", segNb, bNb, pos);
+                        {   size_t const lowest = (u>5) ? 5 : u;
+                            size_t n;
+                            DISPLAY("origin: ");
+                            for (n=lowest; n>0; n--)
+                                DISPLAY("%02X ", ((const BYTE*)srcBuffer)[u-n]);
+                            DISPLAY(" :%02X:  ", ((const BYTE*)srcBuffer)[u]);
+                            for (n=1; n<3; n++)
+                                DISPLAY("%02X ", ((const BYTE*)srcBuffer)[u+n]);
+                            DISPLAY(" \n");
+                            DISPLAY("decode: ");
+                            for (n=lowest; n>0; n--)
+                                DISPLAY("%02X ", resultBuffer[u-n]);
+                            DISPLAY(" :%02X:  ", resultBuffer[u]);
+                            for (n=1; n<3; n++)
+                                DISPLAY("%02X ", resultBuffer[u+n]);
+                            DISPLAY(" \n");
+                        }
+                        break;
+                    }
+                    if (u==srcSize-1) {  /* should never happen */
+                        DISPLAY("no difference detected\n");
+                    }
+                }   /* for (u=0; u<srcSize; u++) */
+            }   /* if ((adv->mode == BMK_both) && (crcOrig!=crcCheck)) */
+        }   /* CRC Checking */
+
+        if (displayLevel == 1) {   /* hidden display mode -q, used by python speed benchmark */
+            double const cSpeed = (double)benchResult.cSpeed / MB_UNIT;
+            double const dSpeed = (double)benchResult.dSpeed / MB_UNIT;
+            if (adv->additionalParam) {
+                OUTPUT("-%-3i%11i (%5.3f) %6.2f MB/s %6.1f MB/s  %s (param=%d)\n", cLevel, (int)cSize, ratio, cSpeed, dSpeed, displayName, adv->additionalParam);
+            } else {
+                OUTPUT("-%-3i%11i (%5.3f) %6.2f MB/s %6.1f MB/s  %s\n", cLevel, (int)cSize, ratio, cSpeed, dSpeed, displayName);
+            }
+        }
+
+        OUTPUTLEVEL(2, "%2i#\n", cLevel);
+    }   /* Bench */
+
+    benchResult.cMem = (1ULL << (comprParams->windowLog)) + ZSTD_sizeof_CCtx(cctx);
+    return BMK_benchOutcome_setValidResult(benchResult);
+}
+
+BMK_benchOutcome_t BMK_benchMemAdvanced(const void* srcBuffer, size_t srcSize,
+                        void* dstBuffer, size_t dstCapacity,
+                        const size_t* fileSizes, unsigned nbFiles,
+                        int cLevel, const ZSTD_compressionParameters* comprParams,
+                        const void* dictBuffer, size_t dictBufferSize,
+                        int displayLevel, const char* displayName, const BMK_advancedParams_t* adv)
+
+{
+    int const dstParamsError = !dstBuffer ^ !dstCapacity;  /* must be both NULL or none */
+
+    size_t const blockSize = ((adv->blockSize>=32 && (adv->mode != BMK_decodeOnly)) ? adv->blockSize : srcSize) + (!srcSize) /* avoid div by 0 */ ;
+    U32 const maxNbBlocks = (U32) ((srcSize + (blockSize-1)) / blockSize) + nbFiles;
+
+    /* these are the blockTable parameters, just split up */
+    const void ** const srcPtrs = (const void**)malloc(maxNbBlocks * sizeof(void*));
+    size_t* const srcSizes = (size_t*)malloc(maxNbBlocks * sizeof(size_t));
+
+
+    void ** const cPtrs = (void**)malloc(maxNbBlocks * sizeof(void*));
+    size_t* const cSizes = (size_t*)malloc(maxNbBlocks * sizeof(size_t));
+    size_t* const cCapacities = (size_t*)malloc(maxNbBlocks * sizeof(size_t));
+
+    void ** const resPtrs = (void**)malloc(maxNbBlocks * sizeof(void*));
+    size_t* const resSizes = (size_t*)malloc(maxNbBlocks * sizeof(size_t));
+
+    BMK_timedFnState_t* timeStateCompress = BMK_createTimedFnState(adv->nbSeconds * 1000, BMK_RUNTEST_DEFAULT_MS);
+    BMK_timedFnState_t* timeStateDecompress = BMK_createTimedFnState(adv->nbSeconds * 1000, BMK_RUNTEST_DEFAULT_MS);
+
+    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+
+    const size_t maxCompressedSize = dstCapacity ? dstCapacity : ZSTD_compressBound(srcSize) + (maxNbBlocks * 1024);
+
+    void* const internalDstBuffer = dstBuffer ? NULL : malloc(maxCompressedSize);
+    void* const compressedBuffer = dstBuffer ? dstBuffer : internalDstBuffer;
+
+    BMK_benchOutcome_t outcome = BMK_benchOutcome_error();  /* error by default */
+
+    void* resultBuffer = srcSize ? malloc(srcSize) : NULL;
+
+    int const allocationincomplete = !srcPtrs || !srcSizes || !cPtrs ||
+        !cSizes || !cCapacities || !resPtrs || !resSizes ||
+        !timeStateCompress || !timeStateDecompress ||
+        !cctx || !dctx ||
+        !compressedBuffer || !resultBuffer;
+
+
+    if (!allocationincomplete && !dstParamsError) {
+        outcome = BMK_benchMemAdvancedNoAlloc(srcPtrs, srcSizes,
+                                            cPtrs, cCapacities, cSizes,
+                                            resPtrs, resSizes,
+                                            &resultBuffer,
+                                            compressedBuffer, maxCompressedSize,
+                                            timeStateCompress, timeStateDecompress,
+                                            srcBuffer, srcSize,
+                                            fileSizes, nbFiles,
+                                            cLevel, comprParams,
+                                            dictBuffer, dictBufferSize,
+                                            cctx, dctx,
+                                            displayLevel, displayName, adv);
+    }
+
+    /* clean up */
+    BMK_freeTimedFnState(timeStateCompress);
+    BMK_freeTimedFnState(timeStateDecompress);
+
+    ZSTD_freeCCtx(cctx);
+    ZSTD_freeDCtx(dctx);
+
+    free(internalDstBuffer);
+    free(resultBuffer);
+
+    free((void*)srcPtrs);
+    free(srcSizes);
+    free(cPtrs);
+    free(cSizes);
+    free(cCapacities);
+    free(resPtrs);
+    free(resSizes);
+
+    if(allocationincomplete) {
+        RETURN_ERROR(31, BMK_benchOutcome_t, "allocation error : not enough memory");
+    }
+
+    if(dstParamsError) {
+        RETURN_ERROR(32, BMK_benchOutcome_t, "Dst parameters not coherent");
+    }
+    return outcome;
+}
+
+BMK_benchOutcome_t BMK_benchMem(const void* srcBuffer, size_t srcSize,
+                        const size_t* fileSizes, unsigned nbFiles,
+                        int cLevel, const ZSTD_compressionParameters* comprParams,
+                        const void* dictBuffer, size_t dictBufferSize,
+                        int displayLevel, const char* displayName) {
+
+    BMK_advancedParams_t const adv = BMK_initAdvancedParams();
+    return BMK_benchMemAdvanced(srcBuffer, srcSize,
+                                NULL, 0,
+                                fileSizes, nbFiles,
+                                cLevel, comprParams,
+                                dictBuffer, dictBufferSize,
+                                displayLevel, displayName, &adv);
+}
+
+static BMK_benchOutcome_t BMK_benchCLevel(const void* srcBuffer, size_t benchedSize,
+                            const size_t* fileSizes, unsigned nbFiles,
+                            int cLevel, const ZSTD_compressionParameters* comprParams,
+                            const void* dictBuffer, size_t dictBufferSize,
+                            int displayLevel, const char* displayName,
+                            BMK_advancedParams_t const * const adv)
+{
+    const char* pch = strrchr(displayName, '\\'); /* Windows */
+    if (!pch) pch = strrchr(displayName, '/');    /* Linux */
+    if (pch) displayName = pch+1;
+
+    if (adv->realTime) {
+        DISPLAYLEVEL(2, "Note : switching to real-time priority \n");
+        SET_REALTIME_PRIORITY;
+    }
+
+    if (displayLevel == 1 && !adv->additionalParam)   /* --quiet mode */
+        OUTPUT("bench %s %s: input %u bytes, %u seconds, %u KB blocks\n",
+                ZSTD_VERSION_STRING, ZSTD_GIT_COMMIT_STRING,
+                (unsigned)benchedSize, adv->nbSeconds, (unsigned)(adv->blockSize>>10));
+
+    return BMK_benchMemAdvanced(srcBuffer, benchedSize,
+                                NULL, 0,
+                                fileSizes, nbFiles,
+                                cLevel, comprParams,
+                                dictBuffer, dictBufferSize,
+                                displayLevel, displayName, adv);
+}
+
+int BMK_syntheticTest(int cLevel, double compressibility,
+                      const ZSTD_compressionParameters* compressionParams,
+                      int displayLevel, const BMK_advancedParams_t* adv)
+{
+    char name[20] = {0};
+    size_t const benchedSize = 10000000;
+    void* srcBuffer;
+    BMK_benchOutcome_t res;
+
+    if (cLevel > ZSTD_maxCLevel()) {
+        DISPLAYLEVEL(1, "Invalid Compression Level");
+        return 15;
+    }
+
+    /* Memory allocation */
+    srcBuffer = malloc(benchedSize);
+    if (!srcBuffer) {
+        DISPLAYLEVEL(1, "allocation error : not enough memory");
+        return 16;
+    }
+
+    /* Fill input buffer */
+    RDG_genBuffer(srcBuffer, benchedSize, compressibility, 0.0, 0);
+
+    /* Bench */
+    snprintf (name, sizeof(name), "Synthetic %2u%%", (unsigned)(compressibility*100));
+    res = BMK_benchCLevel(srcBuffer, benchedSize,
+                    &benchedSize /* ? */, 1 /* ? */,
+                    cLevel, compressionParams,
+                    NULL, 0,  /* dictionary */
+                    displayLevel, name, adv);
+
+    /* clean up */
+    free(srcBuffer);
+
+    return !BMK_isSuccessful_benchOutcome(res);
+}
+
+
+
+static size_t BMK_findMaxMem(U64 requiredMem)
+{
+    size_t const step = 64 MB;
+    BYTE* testmem = NULL;
+
+    requiredMem = (((requiredMem >> 26) + 1) << 26);
+    requiredMem += step;
+    if (requiredMem > maxMemory) requiredMem = maxMemory;
+
+    do {
+        testmem = (BYTE*)malloc((size_t)requiredMem);
+        requiredMem -= step;
+    } while (!testmem && requiredMem > 0);
+
+    free(testmem);
+    return (size_t)(requiredMem);
+}
+
+/*! BMK_loadFiles() :
+ *  Loads `buffer` with content of files listed within `fileNamesTable`.
+ *  At most, fills `buffer` entirely. */
+static int BMK_loadFiles(void* buffer, size_t bufferSize,
+                         size_t* fileSizes,
+                         const char* const * fileNamesTable, unsigned nbFiles,
+                         int displayLevel)
+{
+    size_t pos = 0, totalSize = 0;
+    unsigned n;
+    for (n=0; n<nbFiles; n++) {
+        U64 fileSize = UTIL_getFileSize(fileNamesTable[n]);  /* last file may be shortened */
+        if (UTIL_isDirectory(fileNamesTable[n])) {
+            DISPLAYLEVEL(2, "Ignoring %s directory...       \n", fileNamesTable[n]);
+            fileSizes[n] = 0;
+            continue;
+        }
+        if (fileSize == UTIL_FILESIZE_UNKNOWN) {
+            DISPLAYLEVEL(2, "Cannot evaluate size of %s, ignoring ... \n", fileNamesTable[n]);
+            fileSizes[n] = 0;
+            continue;
+        }
+        {   FILE* const f = fopen(fileNamesTable[n], "rb");
+            if (f==NULL) RETURN_ERROR_INT(10, "impossible to open file %s", fileNamesTable[n]);
+            OUTPUTLEVEL(2, "Loading %s...       \r", fileNamesTable[n]);
+            if (fileSize > bufferSize-pos) fileSize = bufferSize-pos, nbFiles=n;   /* buffer too small - stop after this file */
+            {   size_t const readSize = fread(((char*)buffer)+pos, 1, (size_t)fileSize, f);
+                if (readSize != (size_t)fileSize) RETURN_ERROR_INT(11, "could not read %s", fileNamesTable[n]);
+                pos += readSize;
+            }
+            fileSizes[n] = (size_t)fileSize;
+            totalSize += (size_t)fileSize;
+            fclose(f);
+    }   }
+
+    if (totalSize == 0) RETURN_ERROR_INT(12, "no data to bench");
+    return 0;
+}
+
+int BMK_benchFilesAdvanced(
+                        const char* const * fileNamesTable, unsigned nbFiles,
+                        const char* dictFileName, int cLevel,
+                        const ZSTD_compressionParameters* compressionParams,
+                        int displayLevel, const BMK_advancedParams_t* adv)
+{
+    void* srcBuffer = NULL;
+    size_t benchedSize;
+    void* dictBuffer = NULL;
+    size_t dictBufferSize = 0;
+    size_t* fileSizes = NULL;
+    BMK_benchOutcome_t res;
+    U64 const totalSizeToLoad = UTIL_getTotalFileSize(fileNamesTable, nbFiles);
+
+    if (!nbFiles) {
+        DISPLAYLEVEL(1, "No Files to Benchmark");
+        return 13;
+    }
+
+    if (cLevel > ZSTD_maxCLevel()) {
+        DISPLAYLEVEL(1, "Invalid Compression Level");
+        return 14;
+    }
+
+    if (totalSizeToLoad == UTIL_FILESIZE_UNKNOWN) {
+        DISPLAYLEVEL(1, "Error loading files");
+        return 15;
+    }
+
+    fileSizes = (size_t*)calloc(nbFiles, sizeof(size_t));
+    if (!fileSizes) {
+        DISPLAYLEVEL(1, "not enough memory for fileSizes");
+        return 16;
+    }
+
+    /* Load dictionary */
+    if (dictFileName != NULL) {
+        U64 const dictFileSize = UTIL_getFileSize(dictFileName);
+        if (dictFileSize == UTIL_FILESIZE_UNKNOWN) {
+            DISPLAYLEVEL(1, "error loading %s : %s \n", dictFileName, strerror(errno));
+            free(fileSizes);
+            DISPLAYLEVEL(1, "benchmark aborted");
+            return 17;
+        }
+        if (dictFileSize > 64 MB) {
+            free(fileSizes);
+            DISPLAYLEVEL(1, "dictionary file %s too large", dictFileName);
+            return 18;
+        }
+        dictBufferSize = (size_t)dictFileSize;
+        dictBuffer = malloc(dictBufferSize);
+        if (dictBuffer==NULL) {
+            free(fileSizes);
+            DISPLAYLEVEL(1, "not enough memory for dictionary (%u bytes)",
+                            (unsigned)dictBufferSize);
+            return 19;
+        }
+
+        {   int const errorCode = BMK_loadFiles(dictBuffer, dictBufferSize,
+                                                fileSizes, &dictFileName /*?*/,
+                                                1 /*?*/, displayLevel);
+            if (errorCode) {
+                res = BMK_benchOutcome_error();
+                goto _cleanUp;
+        }   }
+    }
+
+    /* Memory allocation & restrictions */
+    benchedSize = BMK_findMaxMem(totalSizeToLoad * 3) / 3;
+    if ((U64)benchedSize > totalSizeToLoad) benchedSize = (size_t)totalSizeToLoad;
+    if (benchedSize < totalSizeToLoad)
+        DISPLAY("Not enough memory; testing %u MB only...\n", (unsigned)(benchedSize >> 20));
+
+    srcBuffer = benchedSize ? malloc(benchedSize) : NULL;
+    if (!srcBuffer) {
+        free(dictBuffer);
+        free(fileSizes);
+        DISPLAYLEVEL(1, "not enough memory for srcBuffer");
+        return 20;
+    }
+
+    /* Load input buffer */
+    {   int const errorCode = BMK_loadFiles(srcBuffer, benchedSize,
+                                        fileSizes, fileNamesTable, nbFiles,
+                                        displayLevel);
+        if (errorCode) {
+            res = BMK_benchOutcome_error();
+            goto _cleanUp;
+    }   }
+
+    /* Bench */
+    {   char mfName[20] = {0};
+        snprintf (mfName, sizeof(mfName), " %u files", nbFiles);
+        {   const char* const displayName = (nbFiles > 1) ? mfName : fileNamesTable[0];
+            res = BMK_benchCLevel(srcBuffer, benchedSize,
+                                fileSizes, nbFiles,
+                                cLevel, compressionParams,
+                                dictBuffer, dictBufferSize,
+                                displayLevel, displayName,
+                                adv);
+    }   }
+
+_cleanUp:
+    free(srcBuffer);
+    free(dictBuffer);
+    free(fileSizes);
+    return !BMK_isSuccessful_benchOutcome(res);
+}
+
+
+int BMK_benchFiles(const char* const * fileNamesTable, unsigned nbFiles,
+                    const char* dictFileName,
+                    int cLevel, const ZSTD_compressionParameters* compressionParams,
+                    int displayLevel)
+{
+    BMK_advancedParams_t const adv = BMK_initAdvancedParams();
+    return BMK_benchFilesAdvanced(fileNamesTable, nbFiles, dictFileName, cLevel, compressionParams, displayLevel, &adv);
+}
--- a/third_party/zstd/programs/benchzstd.h
+++ b/third_party/zstd/programs/benchzstd.h
@ -0,0 +1,197 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ /* benchzstd :
+  * benchmark Zstandard compression / decompression
+  * over a set of files or buffers
+  * and display progress result and final summary
+  */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef BENCH_ZSTD_H_3242387
+#define BENCH_ZSTD_H_3242387
+
+/* ===  Dependencies  === */
+   /* size_t */
+#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_compressionParameters */
+#include "third_party/zstd/zstd.h"     /* ZSTD_compressionParameters */
+
+
+/* ===  Constants  === */
+
+#define MB_UNIT 1000000
+
+
+/* ===  Benchmark functions  === */
+
+/* Creates a variant `typeName`, able to express "error or valid result".
+ * Functions with return type `typeName`
+ * must first check if result is valid, using BMK_isSuccessful_*(),
+ * and only then can extract `baseType`.
+ */
+#define VARIANT_ERROR_RESULT(baseType, variantName)  \
+                                             \
+typedef struct {                             \
+    baseType internal_never_use_directly;    \
+    int tag;                                 \
+} variantName
+
+
+typedef struct {
+    size_t cSize;
+    unsigned long long cSpeed;   /* bytes / sec */
+    unsigned long long dSpeed;
+    size_t cMem;                 /* memory usage during compression */
+} BMK_benchResult_t;
+
+VARIANT_ERROR_RESULT(BMK_benchResult_t, BMK_benchOutcome_t);
+
+/* check first if the return structure represents an error or a valid result */
+int BMK_isSuccessful_benchOutcome(BMK_benchOutcome_t outcome);
+
+/* extract result from variant type.
+ * note : this function will abort() program execution if result is not valid
+ *        check result validity first, by using BMK_isSuccessful_benchOutcome()
+ */
+BMK_benchResult_t BMK_extract_benchResult(BMK_benchOutcome_t outcome);
+
+
+/*! BMK_benchFiles() -- called by zstdcli */
+/*  Loads files from fileNamesTable into memory,
+ *  and an optional dictionary from dictFileName (can be NULL),
+ *  then uses benchMem().
+ *  fileNamesTable - name of files to benchmark.
+ *  nbFiles - number of files (size of fileNamesTable), must be > 0.
+ *  dictFileName - name of dictionary file to load.
+ *  cLevel - compression level to benchmark, errors if invalid.
+ *  compressionParams - advanced compression Parameters.
+ *  displayLevel - what gets printed:
+ *      0 : no display;
+ *      1 : errors;
+ *      2 : + result + interaction + warnings;
+ *      3 : + information;
+ *      4 : + debug
+ * @return: 0 on success, !0 on error
+ */
+int BMK_benchFiles(
+            const char* const * fileNamesTable, unsigned nbFiles,
+            const char* dictFileName,
+            int cLevel, const ZSTD_compressionParameters* compressionParams,
+            int displayLevel);
+
+
+typedef enum {
+    BMK_both = 0,
+    BMK_decodeOnly = 1,
+    BMK_compressOnly = 2
+} BMK_mode_t;
+
+typedef struct {
+    BMK_mode_t mode;        /* 0: all, 1: compress only 2: decode only */
+    unsigned nbSeconds;     /* default timing is in nbSeconds */
+    size_t blockSize;       /* Maximum size of each block*/
+    int nbWorkers;          /* multithreading */
+    unsigned realTime;      /* real time priority */
+    int additionalParam;    /* used by python speed benchmark */
+    int ldmFlag;            /* enables long distance matching */
+    int ldmMinMatch;        /* below: parameters for long distance matching, see zstd.1.md */
+    int ldmHashLog;
+    int ldmBucketSizeLog;
+    int ldmHashRateLog;
+    ZSTD_paramSwitch_e literalCompressionMode;
+    int useRowMatchFinder;  /* use row-based matchfinder if possible */
+} BMK_advancedParams_t;
+
+/* returns default parameters used by nonAdvanced functions */
+BMK_advancedParams_t BMK_initAdvancedParams(void);
+
+/*! BMK_benchFilesAdvanced():
+ *  Same as BMK_benchFiles(),
+ *  with more controls, provided through advancedParams_t structure */
+int BMK_benchFilesAdvanced(
+               const char* const * fileNamesTable, unsigned nbFiles,
+               const char* dictFileName,
+               int cLevel, const ZSTD_compressionParameters* compressionParams,
+               int displayLevel, const BMK_advancedParams_t* adv);
+
+/*! BMK_syntheticTest() -- called from zstdcli */
+/*  Generates a sample with datagen, using compressibility argument */
+/*  cLevel - compression level to benchmark, errors if invalid
+ *  compressibility - determines compressibility of sample
+ *  compressionParams - basic compression Parameters
+ *  displayLevel - see benchFiles
+ *  adv - see advanced_Params_t
+ * @return: 0 on success, !0 on error
+ */
+int BMK_syntheticTest(int cLevel, double compressibility,
+                      const ZSTD_compressionParameters* compressionParams,
+                      int displayLevel, const BMK_advancedParams_t* adv);
+
+
+
+/* ===  Benchmark Zstandard in a memory-to-memory scenario  === */
+
+/** BMK_benchMem() -- core benchmarking function, called in paramgrill
+ *  applies ZSTD_compress_generic() and ZSTD_decompress_generic() on data in srcBuffer
+ *  with specific compression parameters provided by other arguments using benchFunction
+ *  (cLevel, comprParams + adv in advanced Mode) */
+/*  srcBuffer - data source, expected to be valid compressed data if in Decode Only Mode
+ *  srcSize - size of data in srcBuffer
+ *  fileSizes - srcBuffer is considered cut into 1+ segments, to compress separately.
+ *              note : sum(fileSizes) must be == srcSize.  (<== ensure it's properly checked)
+ *  nbFiles - nb of segments
+ *  cLevel - compression level
+ *  comprParams - basic compression parameters
+ *  dictBuffer - a dictionary if used, null otherwise
+ *  dictBufferSize - size of dictBuffer, 0 otherwise
+ *  displayLevel - see BMK_benchFiles
+ *  displayName - name used by display
+ * @return:
+ *      a variant, which expresses either an error, or a valid result.
+ *      Use BMK_isSuccessful_benchOutcome() to check if function was successful.
+ *      If yes, extract the valid result with BMK_extract_benchResult(),
+ *      it will contain :
+ *          .cSpeed: compression speed in bytes per second,
+ *          .dSpeed: decompression speed in bytes per second,
+ *          .cSize : compressed size, in bytes
+ *          .cMem  : memory budget required for the compression context
+ */
+BMK_benchOutcome_t BMK_benchMem(const void* srcBuffer, size_t srcSize,
+                        const size_t* fileSizes, unsigned nbFiles,
+                        int cLevel, const ZSTD_compressionParameters* comprParams,
+                        const void* dictBuffer, size_t dictBufferSize,
+                        int displayLevel, const char* displayName);
+
+
+/* BMK_benchMemAdvanced() : used by Paramgrill
+ * same as BMK_benchMem() with following additional options :
+ * dstBuffer - destination buffer to write compressed output in, NULL if none provided.
+ * dstCapacity - capacity of destination buffer, give 0 if dstBuffer = NULL
+ * adv = see advancedParams_t
+ */
+BMK_benchOutcome_t BMK_benchMemAdvanced(const void* srcBuffer, size_t srcSize,
+                        void* dstBuffer, size_t dstCapacity,
+                        const size_t* fileSizes, unsigned nbFiles,
+                        int cLevel, const ZSTD_compressionParameters* comprParams,
+                        const void* dictBuffer, size_t dictBufferSize,
+                        int displayLevel, const char* displayName,
+                        const BMK_advancedParams_t* adv);
+
+
+
+#endif   /* BENCH_ZSTD_H_3242387 */
+
+#if defined (__cplusplus)
+}
+#endif
--- a/third_party/zstd/programs/datagen.c
+++ b/third_party/zstd/programs/datagen.c
@ -0,0 +1,210 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+
+/*-************************************
+*  Dependencies
+**************************************/
+#include "third_party/zstd/programs/datagen.h"
+#include "third_party/zstd/programs/platform.h"  /* SET_BINARY_MODE */
+#include "libc/calls/calls.h"
+#include "libc/calls/termios.h"
+#include "libc/fmt/conv.h"
+#include "libc/limits.h"
+#include "libc/mem/alg.h"
+#include "libc/mem/alloca.h"
+#include "libc/mem/mem.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/rand.h"
+#include "libc/stdio/temp.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/exit.h"
+#include "third_party/getopt/getopt.h"
+#include "third_party/musl/crypt.h"
+#include "third_party/musl/rand48.h"    /* malloc, free */
+#include "libc/calls/calls.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/fmt/fmt.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/stdio.h"
+#include "libc/stdio/temp.h"
+#include "third_party/musl/tempnam.h"     /* FILE, fwrite, fprintf */
+#include "libc/mem/alg.h"
+#include "libc/mem/mem.h"
+#include "libc/str/str.h"    /* memcpy */
+#include "third_party/zstd/lib/common/mem.h"  /* U32 */
+
+
+/*-************************************
+*  Macros
+**************************************/
+#define KB *(1 <<10)
+#define MIN(a,b)  ( (a) < (b) ? (a) : (b) )
+
+#define RDG_DEBUG 0
+#define TRACE(...)   if (RDG_DEBUG) fprintf(stderr, __VA_ARGS__ )
+
+
+/*-************************************
+*  Local constants
+**************************************/
+#define LTLOG 13
+#define LTSIZE (1<<LTLOG)
+#define LTMASK (LTSIZE-1)
+
+
+/*-*******************************************************
+*  Local Functions
+*********************************************************/
+#define RDG_rotl32(x,r) ((x << r) | (x >> (32 - r)))
+static U32 RDG_rand(U32* src)
+{
+    static const U32 prime1 = 2654435761U;
+    static const U32 prime2 = 2246822519U;
+    U32 rand32 = *src;
+    rand32 *= prime1;
+    rand32 ^= prime2;
+    rand32  = RDG_rotl32(rand32, 13);
+    *src = rand32;
+    return rand32 >> 5;
+}
+
+typedef U32 fixedPoint_24_8;
+
+static void RDG_fillLiteralDistrib(BYTE* ldt, fixedPoint_24_8 ld)
+{
+    BYTE const firstChar = (ld<=0.0) ?   0 : '(';
+    BYTE const lastChar  = (ld<=0.0) ? 255 : '}';
+    BYTE character = (ld<=0.0) ? 0 : '0';
+    U32 u;
+
+    if (ld<=0) ld = 0;
+    for (u=0; u<LTSIZE; ) {
+        U32 const weight = (((LTSIZE - u) * ld) >> 8) + 1;
+        U32 const end = MIN ( u + weight , LTSIZE);
+        while (u < end) ldt[u++] = character;
+        character++;
+        if (character > lastChar) character = firstChar;
+    }
+}
+
+
+static BYTE RDG_genChar(U32* seed, const BYTE* ldt)
+{
+    U32 const id = RDG_rand(seed) & LTMASK;
+    return ldt[id];  /* memory-sanitizer fails here, stating "uninitialized value" when table initialized with P==0.0. Checked : table is fully initialized */
+}
+
+
+static U32 RDG_rand15Bits (U32* seedPtr)
+{
+    return RDG_rand(seedPtr) & 0x7FFF;
+}
+
+static U32 RDG_randLength(U32* seedPtr)
+{
+    if (RDG_rand(seedPtr) & 7) return (RDG_rand(seedPtr) & 0xF);   /* small length */
+    return (RDG_rand(seedPtr) & 0x1FF) + 0xF;
+}
+
+static void RDG_genBlock(void* buffer, size_t buffSize, size_t prefixSize,
+                         double matchProba, const BYTE* ldt, U32* seedPtr)
+{
+    BYTE* const buffPtr = (BYTE*)buffer;
+    U32 const matchProba32 = (U32)(32768 * matchProba);
+    size_t pos = prefixSize;
+    U32 prevOffset = 1;
+
+    /* special case : sparse content */
+    while (matchProba >= 1.0) {
+        size_t size0 = RDG_rand(seedPtr) & 3;
+        size0  = (size_t)1 << (16 + size0 * 2);
+        size0 += RDG_rand(seedPtr) & (size0-1);   /* because size0 is power of 2*/
+        if (buffSize < pos + size0) {
+            memset(buffPtr+pos, 0, buffSize-pos);
+            return;
+        }
+        memset(buffPtr+pos, 0, size0);
+        pos += size0;
+        buffPtr[pos-1] = RDG_genChar(seedPtr, ldt);
+        continue;
+    }
+
+    /* init */
+    if (pos==0) buffPtr[0] = RDG_genChar(seedPtr, ldt), pos=1;
+
+    /* Generate compressible data */
+    while (pos < buffSize) {
+        /* Select : Literal (char) or Match (within 32K) */
+        if (RDG_rand15Bits(seedPtr) < matchProba32) {
+            /* Copy (within 32K) */
+            U32 const length = RDG_randLength(seedPtr) + 4;
+            U32 const d = (U32) MIN(pos + length , buffSize);
+            U32 const repeatOffset = (RDG_rand(seedPtr) & 15) == 2;
+            U32 const randOffset = RDG_rand15Bits(seedPtr) + 1;
+            U32 const offset = repeatOffset ? prevOffset : (U32) MIN(randOffset , pos);
+            size_t match = pos - offset;
+            while (pos < d) { buffPtr[pos++] = buffPtr[match++];   /* correctly manages overlaps */ }
+            prevOffset = offset;
+        } else {
+            /* Literal (noise) */
+            U32 const length = RDG_randLength(seedPtr);
+            U32 const d = (U32) MIN(pos + length, buffSize);
+            while (pos < d) { buffPtr[pos++] = RDG_genChar(seedPtr, ldt); }
+    }   }
+}
+
+
+void RDG_genBuffer(void* buffer, size_t size, double matchProba, double litProba, unsigned seed)
+{
+    U32 seed32 = seed;
+    BYTE ldt[LTSIZE];
+    memset(ldt, '0', sizeof(ldt));  /* yes, character '0', this is intentional */
+    if (litProba<=0.0) litProba = matchProba / 4.5;
+    RDG_fillLiteralDistrib(ldt, (fixedPoint_24_8)(litProba * 256 + 0.001));
+    RDG_genBlock(buffer, size, 0, matchProba, ldt, &seed32);
+}
+
+
+void RDG_genStdout(unsigned long long size, double matchProba, double litProba, unsigned seed)
+{
+    U32 seed32 = seed;
+    size_t const stdBlockSize = 128 KB;
+    size_t const stdDictSize = 32 KB;
+    BYTE* const buff = (BYTE*)malloc(stdDictSize + stdBlockSize);
+    U64 total = 0;
+    BYTE ldt[LTSIZE];   /* literals distribution table */
+
+    /* init */
+    if (buff==NULL) { perror("datagen"); exit(1); }
+    if (litProba<=0.0) litProba = matchProba / 4.5;
+    memset(ldt, '0', sizeof(ldt));   /* yes, character '0', this is intentional */
+    RDG_fillLiteralDistrib(ldt, (fixedPoint_24_8)(litProba * 256 + 0.001));
+    SET_BINARY_MODE(stdout);
+
+    /* Generate initial dict */
+    RDG_genBlock(buff, stdDictSize, 0, matchProba, ldt, &seed32);
+
+    /* Generate compressible data */
+    while (total < size) {
+        size_t const genBlockSize = (size_t) (MIN (stdBlockSize, size-total));
+        RDG_genBlock(buff, stdDictSize+stdBlockSize, stdDictSize, matchProba, ldt, &seed32);
+        total += genBlockSize;
+        { size_t const unused = fwrite(buff, 1, genBlockSize, stdout); (void)unused; }
+        /* update dict */
+        memcpy(buff, buff + stdBlockSize, stdDictSize);
+    }
+
+    /* cleanup */
+    free(buff);
+}
--- a/third_party/zstd/programs/datagen.h
+++ b/third_party/zstd/programs/datagen.h
@ -0,0 +1,31 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+#ifndef DATAGEN_H
+#define DATAGEN_H
+
+   /* size_t */
+
+void RDG_genStdout(unsigned long long size, double matchProba, double litProba, unsigned seed);
+void RDG_genBuffer(void* buffer, size_t size, double matchProba, double litProba, unsigned seed);
+/*!RDG_genBuffer
+   Generate 'size' bytes of compressible data into 'buffer'.
+   Compressibility can be controlled using 'matchProba', which is floating point value between 0 and 1.
+   'LitProba' is optional, it affect variability of individual bytes. If litProba==0.0, default value will be used.
+   Generated data pattern can be modified using different 'seed'.
+   For a triplet (matchProba, litProba, seed), the function always generate the same content.
+
+   RDG_genStdout
+   Same as RDG_genBuffer, but generates data into stdout
+*/
+
+#endif
--- a/third_party/zstd/programs/dibio.c
+++ b/third_party/zstd/programs/dibio.c
@ -0,0 +1,465 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+
+/* **************************************
+*  Compiler Warnings
+****************************************/
+#ifdef _MSC_VER
+#  pragma warning(disable : 4127)    /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/*-*************************************
+*  Includes
+***************************************/
+#include "third_party/zstd/programs/platform.h"       /* Large Files support */
+#include "third_party/zstd/programs/util.h"           /* UTIL_getFileSize, UTIL_getTotalFileSize */
+#include "libc/calls/calls.h"
+#include "libc/calls/termios.h"
+#include "libc/assert.h"
+#include "libc/fmt/conv.h"
+#include "libc/limits.h"
+#include "libc/mem/alg.h"
+#include "libc/mem/alloca.h"
+#include "libc/mem/mem.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/rand.h"
+#include "libc/stdio/temp.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/exit.h"
+#include "third_party/getopt/getopt.h"
+#include "third_party/musl/crypt.h"
+#include "third_party/musl/rand48.h"         /* malloc, free */
+#include "libc/mem/alg.h"
+#include "libc/mem/mem.h"
+#include "libc/str/str.h"         /* memset */
+#include "libc/calls/calls.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/fmt/fmt.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/stdio.h"
+#include "libc/stdio/temp.h"
+#include "third_party/musl/tempnam.h"          /* fprintf, fopen, ftello64 */
+#include "libc/errno.h"          /* errno */
+
+#include "third_party/zstd/programs/timefn.h"         /* UTIL_time_t, UTIL_clockSpanMicro, UTIL_getTime */
+#include "third_party/zstd/lib/common/debug.h" /* assert */
+#include "third_party/zstd/lib/common/mem.h"  /* read */
+#include "third_party/zstd/zstd_errors.h"
+#include "third_party/zstd/programs/dibio.h"
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define SAMPLESIZE_MAX (128 KB)
+#define MEMMULT 11    /* rough estimation : memory cost to analyze 1 byte of sample */
+#define COVER_MEMMULT 9    /* rough estimation : memory cost to analyze 1 byte of sample */
+#define FASTCOVER_MEMMULT 1    /* rough estimation : memory cost to analyze 1 byte of sample */
+static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
+
+#define NOISELENGTH 32
+#define MAX_SAMPLES_SIZE (2 GB) /* training dataset limited to 2GB */
+
+
+/*-*************************************
+*  Console display
+***************************************/
+#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
+
+static const U64 g_refreshRate = SEC_TO_MICRO / 6;
+static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
+
+#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
+            if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
+            { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
+            if (displayLevel>=4) fflush(stderr); } } }
+
+/*-*************************************
+*  Exceptions
+***************************************/
+#ifndef DEBUG
+#  define DEBUG 0
+#endif
+#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
+#define EXM_THROW(error, ...)                                             \
+{                                                                         \
+    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
+    DISPLAY("Error %i : ", error);                                        \
+    DISPLAY(__VA_ARGS__);                                                 \
+    DISPLAY("\n");                                                        \
+    exit(error);                                                          \
+}
+
+
+/* ********************************************************
+*  Helper functions
+**********************************************************/
+#undef MIN
+#define MIN(a,b)    ((a) < (b) ? (a) : (b))
+
+/**
+  Returns the size of a file.
+  If error returns -1.
+*/
+static S64 DiB_getFileSize (const char * fileName)
+{
+    U64 const fileSize = UTIL_getFileSize(fileName);
+    return (fileSize == UTIL_FILESIZE_UNKNOWN) ? -1 : (S64)fileSize;
+}
+
+/* ********************************************************
+*  File related operations
+**********************************************************/
+/** DiB_loadFiles() :
+ *  load samples from files listed in fileNamesTable into buffer.
+ *  works even if buffer is too small to load all samples.
+ *  Also provides the size of each sample into sampleSizes table
+ *  which must be sized correctly, using DiB_fileStats().
+ * @return : nb of samples effectively loaded into `buffer`
+ * *bufferSizePtr is modified, it provides the amount data loaded within buffer.
+ *  sampleSizes is filled with the size of each sample.
+ */
+static int DiB_loadFiles(
+    void* buffer, size_t* bufferSizePtr,
+    size_t* sampleSizes, int sstSize,
+    const char** fileNamesTable, int nbFiles,
+    size_t targetChunkSize, int displayLevel )
+{
+    char* const buff = (char*)buffer;
+    size_t totalDataLoaded = 0;
+    int nbSamplesLoaded = 0;
+    int fileIndex = 0;
+    FILE * f = NULL;
+
+    assert(targetChunkSize <= SAMPLESIZE_MAX);
+
+    while ( nbSamplesLoaded < sstSize && fileIndex < nbFiles ) {
+        size_t fileDataLoaded;
+        S64 const fileSize = DiB_getFileSize(fileNamesTable[fileIndex]);
+        if (fileSize <= 0) {
+            /* skip if zero-size or file error */
+            ++fileIndex;
+            continue;
+        }
+
+        f = fopen( fileNamesTable[fileIndex], "rb");
+        if (f == NULL)
+            EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileNamesTable[fileIndex], strerror(errno));
+        DISPLAYUPDATE(2, "Loading %s...       \r", fileNamesTable[fileIndex]);
+
+        /* Load the first chunk of data from the file */
+        fileDataLoaded = targetChunkSize > 0 ?
+                            (size_t)MIN(fileSize, (S64)targetChunkSize) :
+                            (size_t)MIN(fileSize, SAMPLESIZE_MAX );
+        if (totalDataLoaded + fileDataLoaded > *bufferSizePtr)
+            break;
+        if (fread( buff+totalDataLoaded, 1, fileDataLoaded, f ) != fileDataLoaded)
+            EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
+        sampleSizes[nbSamplesLoaded++] = fileDataLoaded;
+        totalDataLoaded += fileDataLoaded;
+
+        /* If file-chunking is enabled, load the rest of the file as more samples */
+        if (targetChunkSize > 0) {
+            while( (S64)fileDataLoaded < fileSize && nbSamplesLoaded < sstSize ) {
+                size_t const chunkSize = MIN((size_t)(fileSize-fileDataLoaded), targetChunkSize);
+                if (totalDataLoaded + chunkSize > *bufferSizePtr) /* buffer is full */
+                    break;
+
+                if (fread( buff+totalDataLoaded, 1, chunkSize, f ) != chunkSize)
+                    EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
+                sampleSizes[nbSamplesLoaded++] = chunkSize;
+                totalDataLoaded += chunkSize;
+                fileDataLoaded += chunkSize;
+            }
+        }
+        fileIndex += 1;
+        fclose(f); f = NULL;
+    }
+    if (f != NULL)
+        fclose(f);
+
+    DISPLAYLEVEL(2, "\r%79s\r", "");
+    DISPLAYLEVEL(4, "Loaded %d KB total training data, %d nb samples \n",
+        (int)(totalDataLoaded / (1 KB)), nbSamplesLoaded );
+    *bufferSizePtr = totalDataLoaded;
+    return nbSamplesLoaded;
+}
+
+#define DiB_rotl32(x,r) ((x << r) | (x >> (32 - r)))
+static U32 DiB_rand(U32* src)
+{
+    static const U32 prime1 = 2654435761U;
+    static const U32 prime2 = 2246822519U;
+    U32 rand32 = *src;
+    rand32 *= prime1;
+    rand32 ^= prime2;
+    rand32  = DiB_rotl32(rand32, 13);
+    *src = rand32;
+    return rand32 >> 5;
+}
+
+/* DiB_shuffle() :
+ * shuffle a table of file names in a semi-random way
+ * It improves dictionary quality by reducing "locality" impact, so if sample set is very large,
+ * it will load random elements from it, instead of just the first ones. */
+static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
+    U32 seed = 0xFD2FB528;
+    unsigned i;
+    if (nbFiles == 0)
+        return;
+    for (i = nbFiles - 1; i > 0; --i) {
+        unsigned const j = DiB_rand(&seed) % (i + 1);
+        const char* const tmp = fileNamesTable[j];
+        fileNamesTable[j] = fileNamesTable[i];
+        fileNamesTable[i] = tmp;
+    }
+}
+
+
+/*-********************************************************
+*  Dictionary training functions
+**********************************************************/
+static size_t DiB_findMaxMem(unsigned long long requiredMem)
+{
+    size_t const step = 8 MB;
+    void* testmem = NULL;
+
+    requiredMem = (((requiredMem >> 23) + 1) << 23);
+    requiredMem += step;
+    if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;
+
+    while (!testmem) {
+        testmem = malloc((size_t)requiredMem);
+        requiredMem -= step;
+    }
+
+    free(testmem);
+    return (size_t)requiredMem;
+}
+
+
+static void DiB_fillNoise(void* buffer, size_t length)
+{
+    unsigned const prime1 = 2654435761U;
+    unsigned const prime2 = 2246822519U;
+    unsigned acc = prime1;
+    size_t p=0;
+
+    for (p=0; p<length; p++) {
+        acc *= prime2;
+        ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
+    }
+}
+
+
+static void DiB_saveDict(const char* dictFileName,
+                         const void* buff, size_t buffSize)
+{
+    FILE* const f = fopen(dictFileName, "wb");
+    if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
+
+    { size_t const n = fwrite(buff, 1, buffSize, f);
+      if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) }
+
+    { size_t const n = (size_t)fclose(f);
+      if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }
+}
+
+typedef struct {
+    S64 totalSizeToLoad;
+    int nbSamples;
+    int oneSampleTooLarge;
+} fileStats;
+
+/*! DiB_fileStats() :
+ *  Given a list of files, and a chunkSize (0 == no chunk, whole files)
+ *  provides the amount of data to be loaded and the resulting nb of samples.
+ *  This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
+ */
+static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t chunkSize, int displayLevel)
+{
+    fileStats fs;
+    int n;
+    memset(&fs, 0, sizeof(fs));
+
+    /* We assume that if chunking is requested, the chunk size is < SAMPLESIZE_MAX */
+    assert( chunkSize <= SAMPLESIZE_MAX );
+
+    for (n=0; n<nbFiles; n++) {
+      S64 const fileSize = DiB_getFileSize(fileNamesTable[n]);
+      /* TODO: is there a minimum sample size? What if the file is 1-byte? */
+      if (fileSize == 0) {
+        DISPLAYLEVEL(3, "Sample file '%s' has zero size, skipping...\n", fileNamesTable[n]);
+        continue;
+      }
+
+      /* the case where we are breaking up files in sample chunks */
+      if (chunkSize > 0) {
+        /* TODO: is there a minimum sample size? Can we have a 1-byte sample? */
+        fs.nbSamples += (int)((fileSize + chunkSize-1) / chunkSize);
+        fs.totalSizeToLoad += fileSize;
+      }
+      else {
+      /* the case where one file is one sample */
+        if (fileSize > SAMPLESIZE_MAX) {
+          /* flag excessively large sample files */
+          fs.oneSampleTooLarge |= (fileSize > 2*SAMPLESIZE_MAX);
+
+          /* Limit to the first SAMPLESIZE_MAX (128kB) of the file */
+          DISPLAYLEVEL(3, "Sample file '%s' is too large, limiting to %d KB",
+              fileNamesTable[n], SAMPLESIZE_MAX / (1 KB));
+        }
+        fs.nbSamples += 1;
+        fs.totalSizeToLoad += MIN(fileSize, SAMPLESIZE_MAX);
+      }
+    }
+    DISPLAYLEVEL(4, "Found training data %d files, %d KB, %d samples\n", nbFiles, (int)(fs.totalSizeToLoad / (1 KB)), fs.nbSamples);
+    return fs;
+}
+
+int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
+                       const char** fileNamesTable, int nbFiles, size_t chunkSize,
+                       ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
+                       ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit)
+{
+    fileStats fs;
+    size_t* sampleSizes; /* vector of sample sizes. Each sample can be up to SAMPLESIZE_MAX */
+    int nbSamplesLoaded; /* nb of samples effectively loaded in srcBuffer */
+    size_t loadedSize; /* total data loaded in srcBuffer for all samples */
+    void* srcBuffer /* contiguous buffer with training data/samples */;
+    void* const dictBuffer = malloc(maxDictSize);
+    int result = 0;
+
+    int const displayLevel = params ? params->zParams.notificationLevel :
+        coverParams ? coverParams->zParams.notificationLevel :
+        fastCoverParams ? fastCoverParams->zParams.notificationLevel : 0;
+
+    /* Shuffle input files before we start assessing how much sample datA to load.
+       The purpose of the shuffle is to pick random samples when the sample
+       set is larger than what we can load in memory. */
+    DISPLAYLEVEL(3, "Shuffling input files\n");
+    DiB_shuffle(fileNamesTable, nbFiles);
+
+    /* Figure out how much sample data to load with how many samples */
+    fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
+
+    {
+        int const memMult = params ? MEMMULT :
+                            coverParams ? COVER_MEMMULT:
+                            FASTCOVER_MEMMULT;
+        size_t const maxMem =  DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
+        /* Limit the size of the training data to the free memory */
+        /* Limit the size of the training data to 2GB */
+        /* TODO: there is opportunity to stop DiB_fileStats() early when the data limit is reached */
+        loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
+        if (memLimit != 0) {
+            DISPLAYLEVEL(2, "!  Warning : setting manual memory limit for dictionary training data at %u MB \n",
+                (unsigned)(memLimit / (1 MB)));
+            loadedSize = (size_t)MIN(loadedSize, memLimit);
+        }
+        srcBuffer = malloc(loadedSize+NOISELENGTH);
+        sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
+    }
+
+    /* Checks */
+    if ((fs.nbSamples && !sampleSizes) || (!srcBuffer) || (!dictBuffer))
+        EXM_THROW(12, "not enough memory for DiB_trainFiles");   /* should not happen */
+    if (fs.oneSampleTooLarge) {
+        DISPLAYLEVEL(2, "!  Warning : some sample(s) are very large \n");
+        DISPLAYLEVEL(2, "!  Note that dictionary is only useful for small samples. \n");
+        DISPLAYLEVEL(2, "!  As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
+    }
+    if (fs.nbSamples < 5) {
+        DISPLAYLEVEL(2, "!  Warning : nb of samples too low for proper processing ! \n");
+        DISPLAYLEVEL(2, "!  Please provide _one file per sample_. \n");
+        DISPLAYLEVEL(2, "!  Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
+        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
+    }
+    if (fs.totalSizeToLoad < (S64)maxDictSize * 8) {
+        DISPLAYLEVEL(2, "!  Warning : data size of samples too small for target dictionary size \n");
+        DISPLAYLEVEL(2, "!  Samples should be about 100x larger than target dictionary size \n");
+    }
+
+    /* init */
+    if ((S64)loadedSize < fs.totalSizeToLoad)
+        DISPLAYLEVEL(1, "Training samples set too large (%u MB); training on %u MB only...\n",
+            (unsigned)(fs.totalSizeToLoad / (1 MB)),
+            (unsigned)(loadedSize / (1 MB)));
+
+    /* Load input buffer */
+    nbSamplesLoaded = DiB_loadFiles(
+        srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable,
+        nbFiles, chunkSize, displayLevel);
+
+    {   size_t dictSize = ZSTD_error_GENERIC;
+        if (params) {
+            DiB_fillNoise((char*)srcBuffer + loadedSize, NOISELENGTH);   /* guard band, for end of buffer condition */
+            dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize,
+                                                    srcBuffer, sampleSizes, nbSamplesLoaded,
+                                                    *params);
+        } else if (coverParams) {
+            if (optimize) {
+              dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
+                                                             srcBuffer, sampleSizes, nbSamplesLoaded,
+                                                             coverParams);
+              if (!ZDICT_isError(dictSize)) {
+                  unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
+                  DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d,
+                              coverParams->steps, splitPercentage);
+              }
+            } else {
+              dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
+                                                     sampleSizes, nbSamplesLoaded, *coverParams);
+            }
+        } else if (fastCoverParams != NULL) {
+            if (optimize) {
+              dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize,
+                                                              srcBuffer, sampleSizes, nbSamplesLoaded,
+                                                              fastCoverParams);
+              if (!ZDICT_isError(dictSize)) {
+                unsigned splitPercentage = (unsigned)(fastCoverParams->splitPoint * 100);
+                DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastCoverParams->k,
+                            fastCoverParams->d, fastCoverParams->f, fastCoverParams->steps, splitPercentage,
+                            fastCoverParams->accel);
+              }
+            } else {
+              dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, srcBuffer,
+                                                        sampleSizes, nbSamplesLoaded, *fastCoverParams);
+            }
+        } else {
+            assert(0 /* Impossible */);
+        }
+        if (ZDICT_isError(dictSize)) {
+            DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
+            result = 1;
+            goto _cleanup;
+        }
+        /* save dict */
+        DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (unsigned)dictSize, dictFileName);
+        DiB_saveDict(dictFileName, dictBuffer, dictSize);
+    }
+
+    /* clean up */
+_cleanup:
+    free(srcBuffer);
+    free(sampleSizes);
+    free(dictBuffer);
+    return result;
+}
--- a/third_party/zstd/programs/dibio.h
+++ b/third_party/zstd/programs/dibio.h
@ -0,0 +1,40 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* This library is designed for a single-threaded console application.
+*  It exit() and printf() into stderr when it encounters an error condition. */
+
+#ifndef DIBIO_H_003
+#define DIBIO_H_003
+
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#define ZDICT_STATIC_LINKING_ONLY
+#include "third_party/zstd/zdict.h"     /* ZDICT_params_t */
+
+
+/*-*************************************
+*  Public functions
+***************************************/
+/*! DiB_trainFromFiles() :
+    Train a dictionary from a set of files provided by `fileNamesTable`.
+    Resulting dictionary is written into file `dictFileName`.
+    `parameters` is optional and can be provided with values set to 0, meaning "default".
+    @return : 0 == ok. Any other : error.
+*/
+int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
+                       const char** fileNamesTable, int nbFiles, size_t chunkSize,
+                       ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
+                       ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit);
+
+#endif
--- a/third_party/zstd/programs/fileio.c
+++ b/third_party/zstd/programs/fileio.c
--- a/third_party/zstd/programs/fileio.h
+++ b/third_party/zstd/programs/fileio.h
@ -0,0 +1,182 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+#ifndef FILEIO_H_23981798732
+#define FILEIO_H_23981798732
+
+#include "third_party/zstd/programs/fileio_types.h"
+#include "third_party/zstd/programs/util.h"                  /* FileNamesTable */
+#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_compressionParameters */
+#include "third_party/zstd/zstd.h"           /* ZSTD_* */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* *************************************
+*  Special i/o constants
+**************************************/
+#define stdinmark  "/*stdin*\\"
+#define stdoutmark "/*stdout*\\"
+#ifdef _WIN32
+#  define nulmark "NUL"
+#else
+#  define nulmark "/dev/null"
+#endif
+
+/**
+ * We test whether the extension we found starts with 't', and if so, we append
+ * ".tar" to the end of the output name.
+ */
+#define LZMA_EXTENSION  ".lzma"
+#define XZ_EXTENSION    ".xz"
+#define TXZ_EXTENSION   ".txz"
+
+#define GZ_EXTENSION    ".gz"
+#define TGZ_EXTENSION   ".tgz"
+
+#define ZSTD_EXTENSION  ".zst"
+#define TZSTD_EXTENSION ".tzst"
+#define ZSTD_ALT_EXTENSION  ".zstd" /* allow decompression of .zstd files */
+
+#define LZ4_EXTENSION   ".lz4"
+#define TLZ4_EXTENSION  ".tlz4"
+
+
+/*-*************************************
+*  Types
+***************************************/
+FIO_prefs_t* FIO_createPreferences(void);
+void FIO_freePreferences(FIO_prefs_t* const prefs);
+
+/* Mutable struct containing relevant context and state regarding (de)compression with respect to file I/O */
+typedef struct FIO_ctx_s FIO_ctx_t;
+
+FIO_ctx_t* FIO_createContext(void);
+void FIO_freeContext(FIO_ctx_t* const fCtx);
+
+
+/*-*************************************
+*  Parameters
+***************************************/
+/* FIO_prefs_t functions */
+void FIO_setCompressionType(FIO_prefs_t* const prefs, FIO_compressionType_t compressionType);
+void FIO_overwriteMode(FIO_prefs_t* const prefs);
+void FIO_setAdaptiveMode(FIO_prefs_t* const prefs, int adapt);
+void FIO_setAdaptMin(FIO_prefs_t* const prefs, int minCLevel);
+void FIO_setAdaptMax(FIO_prefs_t* const prefs, int maxCLevel);
+void FIO_setUseRowMatchFinder(FIO_prefs_t* const prefs, int useRowMatchFinder);
+void FIO_setBlockSize(FIO_prefs_t* const prefs, int blockSize);
+void FIO_setChecksumFlag(FIO_prefs_t* const prefs, int checksumFlag);
+void FIO_setDictIDFlag(FIO_prefs_t* const prefs, int dictIDFlag);
+void FIO_setLdmBucketSizeLog(FIO_prefs_t* const prefs, int ldmBucketSizeLog);
+void FIO_setLdmFlag(FIO_prefs_t* const prefs, unsigned ldmFlag);
+void FIO_setLdmHashRateLog(FIO_prefs_t* const prefs, int ldmHashRateLog);
+void FIO_setLdmHashLog(FIO_prefs_t* const prefs, int ldmHashLog);
+void FIO_setLdmMinMatch(FIO_prefs_t* const prefs, int ldmMinMatch);
+void FIO_setMemLimit(FIO_prefs_t* const prefs, unsigned memLimit);
+void FIO_setNbWorkers(FIO_prefs_t* const prefs, int nbWorkers);
+void FIO_setOverlapLog(FIO_prefs_t* const prefs, int overlapLog);
+void FIO_setRemoveSrcFile(FIO_prefs_t* const prefs, int flag);
+void FIO_setSparseWrite(FIO_prefs_t* const prefs, int sparse);  /**< 0: no sparse; 1: disable on stdout; 2: always enabled */
+void FIO_setRsyncable(FIO_prefs_t* const prefs, int rsyncable);
+void FIO_setStreamSrcSize(FIO_prefs_t* const prefs, size_t streamSrcSize);
+void FIO_setTargetCBlockSize(FIO_prefs_t* const prefs, size_t targetCBlockSize);
+void FIO_setSrcSizeHint(FIO_prefs_t* const prefs, size_t srcSizeHint);
+void FIO_setTestMode(FIO_prefs_t* const prefs, int testMode);
+void FIO_setLiteralCompressionMode(
+        FIO_prefs_t* const prefs,
+        ZSTD_paramSwitch_e mode);
+
+void FIO_setProgressSetting(FIO_progressSetting_e progressSetting);
+void FIO_setNotificationLevel(int level);
+void FIO_setExcludeCompressedFile(FIO_prefs_t* const prefs, int excludeCompressedFiles);
+void FIO_setAllowBlockDevices(FIO_prefs_t* const prefs, int allowBlockDevices);
+void FIO_setPatchFromMode(FIO_prefs_t* const prefs, int value);
+void FIO_setContentSize(FIO_prefs_t* const prefs, int value);
+void FIO_displayCompressionParameters(const FIO_prefs_t* prefs);
+void FIO_setAsyncIOFlag(FIO_prefs_t* const prefs, int value);
+void FIO_setPassThroughFlag(FIO_prefs_t* const prefs, int value);
+void FIO_setMMapDict(FIO_prefs_t* const prefs, ZSTD_paramSwitch_e value);
+
+/* FIO_ctx_t functions */
+void FIO_setNbFilesTotal(FIO_ctx_t* const fCtx, int value);
+void FIO_setHasStdoutOutput(FIO_ctx_t* const fCtx, int value);
+void FIO_determineHasStdinInput(FIO_ctx_t* const fCtx, const FileNamesTable* const filenames);
+
+/*-*************************************
+*  Single File functions
+***************************************/
+/** FIO_compressFilename() :
+ * @return : 0 == ok;  1 == pb with src file. */
+int FIO_compressFilename (FIO_ctx_t* const fCtx, FIO_prefs_t* const prefs,
+                          const char* outfilename, const char* infilename,
+                          const char* dictFileName, int compressionLevel,
+                          ZSTD_compressionParameters comprParams);
+
+/** FIO_decompressFilename() :
+ * @return : 0 == ok;  1 == pb with src file. */
+int FIO_decompressFilename (FIO_ctx_t* const fCtx, FIO_prefs_t* const prefs,
+                            const char* outfilename, const char* infilename, const char* dictFileName);
+
+int FIO_listMultipleFiles(unsigned numFiles, const char** filenameTable, int displayLevel);
+
+
+/*-*************************************
+*  Multiple File functions
+***************************************/
+/** FIO_compressMultipleFilenames() :
+ * @return : nb of missing files */
+int FIO_compressMultipleFilenames(FIO_ctx_t* const fCtx,
+                                  FIO_prefs_t* const prefs,
+                                  const char** inFileNamesTable,
+                                  const char* outMirroredDirName,
+                                  const char* outDirName,
+                                  const char* outFileName, const char* suffix,
+                                  const char* dictFileName, int compressionLevel,
+                                  ZSTD_compressionParameters comprParams);
+
+/** FIO_decompressMultipleFilenames() :
+ * @return : nb of missing or skipped files */
+int FIO_decompressMultipleFilenames(FIO_ctx_t* const fCtx,
+                                    FIO_prefs_t* const prefs,
+                                    const char** srcNamesTable,
+                                    const char* outMirroredDirName,
+                                    const char* outDirName,
+                                    const char* outFileName,
+                                    const char* dictFileName);
+
+/* FIO_checkFilenameCollisions() :
+ * Checks for and warns if there are any files that would have the same output path
+ */
+int FIO_checkFilenameCollisions(const char** filenameTable, unsigned nbFiles);
+
+
+
+/*-*************************************
+*  Advanced stuff (should actually be hosted elsewhere)
+***************************************/
+
+/* custom crash signal handler */
+void FIO_addAbortHandler(void);
+
+char const* FIO_zlibVersion(void);
+char const* FIO_lz4Version(void);
+char const* FIO_lzmaVersion(void);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* FILEIO_H_23981798732 */
--- a/third_party/zstd/programs/fileio_asyncio.c
+++ b/third_party/zstd/programs/fileio_asyncio.c
@ -0,0 +1,692 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include "third_party/zstd/programs/platform.h"
+#include "libc/calls/calls.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/fmt/fmt.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/stdio.h"
+#include "libc/stdio/temp.h"
+#include "third_party/musl/tempnam.h"      /* fprintf, open, fdopen, fread, _fileno, stdin, stdout */
+#include "libc/calls/calls.h"
+#include "libc/calls/termios.h"
+#include "libc/fmt/conv.h"
+#include "libc/limits.h"
+#include "libc/mem/alg.h"
+#include "libc/mem/alloca.h"
+#include "libc/mem/mem.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/rand.h"
+#include "libc/stdio/temp.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/exit.h"
+#include "third_party/getopt/getopt.h"
+#include "third_party/musl/crypt.h"
+#include "third_party/musl/rand48.h"     /* malloc, free */
+#include "libc/assert.h"
+#include "libc/errno.h"      /* errno */
+
+#if defined (_MSC_VER)
+#include "libc/calls/calls.h"
+#include "libc/calls/struct/stat.h"
+#include "libc/calls/struct/stat.macros.h"
+#include "libc/calls/struct/timespec.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/sysv/consts/s.h"
+#include "libc/sysv/consts/utime.h"
+#include "libc/time/time.h"
+// MISSING #include <io.h>
+#endif
+
+#include "third_party/zstd/programs/fileio_asyncio.h"
+#include "third_party/zstd/programs/fileio_common.h"
+
+/* **********************************************************************
+ *  Sparse write
+ ************************************************************************/
+
+/** AIO_fwriteSparse() :
+*  @return : storedSkips,
+*            argument for next call to AIO_fwriteSparse() or AIO_fwriteSparseEnd() */
+static unsigned
+AIO_fwriteSparse(FILE* file,
+                 const void* buffer, size_t bufferSize,
+                 const FIO_prefs_t* const prefs,
+                 unsigned storedSkips)
+{
+    const size_t* const bufferT = (const size_t*)buffer;   /* Buffer is supposed malloc'ed, hence aligned on size_t */
+    size_t bufferSizeT = bufferSize / sizeof(size_t);
+    const size_t* const bufferTEnd = bufferT + bufferSizeT;
+    const size_t* ptrT = bufferT;
+    static const size_t segmentSizeT = (32 KB) / sizeof(size_t);   /* check every 32 KB */
+
+    if (prefs->testMode) return 0;  /* do not output anything in test mode */
+
+    if (!prefs->sparseFileSupport) {  /* normal write */
+        size_t const sizeCheck = fwrite(buffer, 1, bufferSize, file);
+        if (sizeCheck != bufferSize)
+            EXM_THROW(70, "Write error : cannot write block : %s",
+                      strerror(errno));
+        return 0;
+    }
+
+    /* avoid int overflow */
+    if (storedSkips > 1 GB) {
+        if (LONG_SEEK(file, 1 GB, SEEK_CUR) != 0)
+        EXM_THROW(91, "1 GB skip error (sparse file support)");
+        storedSkips -= 1 GB;
+    }
+
+    while (ptrT < bufferTEnd) {
+        size_t nb0T;
+
+        /* adjust last segment if < 32 KB */
+        size_t seg0SizeT = segmentSizeT;
+        if (seg0SizeT > bufferSizeT) seg0SizeT = bufferSizeT;
+        bufferSizeT -= seg0SizeT;
+
+        /* count leading zeroes */
+        for (nb0T=0; (nb0T < seg0SizeT) && (ptrT[nb0T] == 0); nb0T++) ;
+        storedSkips += (unsigned)(nb0T * sizeof(size_t));
+
+        if (nb0T != seg0SizeT) {   /* not all 0s */
+            size_t const nbNon0ST = seg0SizeT - nb0T;
+            /* skip leading zeros */
+            if (LONG_SEEK(file, storedSkips, SEEK_CUR) != 0)
+                EXM_THROW(92, "Sparse skip error ; try --no-sparse");
+            storedSkips = 0;
+            /* write the rest */
+            if (fwrite(ptrT + nb0T, sizeof(size_t), nbNon0ST, file) != nbNon0ST)
+                EXM_THROW(93, "Write error : cannot write block : %s",
+                          strerror(errno));
+        }
+        ptrT += seg0SizeT;
+    }
+
+    {   static size_t const maskT = sizeof(size_t)-1;
+        if (bufferSize & maskT) {
+            /* size not multiple of sizeof(size_t) : implies end of block */
+            const char* const restStart = (const char*)bufferTEnd;
+            const char* restPtr = restStart;
+            const char* const restEnd = (const char*)buffer + bufferSize;
+            assert(restEnd > restStart && restEnd < restStart + sizeof(size_t));
+            for ( ; (restPtr < restEnd) && (*restPtr == 0); restPtr++) ;
+            storedSkips += (unsigned) (restPtr - restStart);
+            if (restPtr != restEnd) {
+                /* not all remaining bytes are 0 */
+                size_t const restSize = (size_t)(restEnd - restPtr);
+                if (LONG_SEEK(file, storedSkips, SEEK_CUR) != 0)
+                    EXM_THROW(92, "Sparse skip error ; try --no-sparse");
+                if (fwrite(restPtr, 1, restSize, file) != restSize)
+                    EXM_THROW(95, "Write error : cannot write end of decoded block : %s",
+                              strerror(errno));
+                storedSkips = 0;
+            }   }   }
+
+    return storedSkips;
+}
+
+static void
+AIO_fwriteSparseEnd(const FIO_prefs_t* const prefs, FILE* file, unsigned storedSkips)
+{
+    if (prefs->testMode) assert(storedSkips == 0);
+    if (storedSkips>0) {
+        assert(prefs->sparseFileSupport > 0);  /* storedSkips>0 implies sparse support is enabled */
+        (void)prefs;   /* assert can be disabled, in which case prefs becomes unused */
+        if (LONG_SEEK(file, storedSkips-1, SEEK_CUR) != 0)
+            EXM_THROW(69, "Final skip error (sparse file support)");
+        /* last zero must be explicitly written,
+         * so that skipped ones get implicitly translated as zero by FS */
+        {   const char lastZeroByte[1] = { 0 };
+            if (fwrite(lastZeroByte, 1, 1, file) != 1)
+                EXM_THROW(69, "Write error : cannot write last zero : %s", strerror(errno));
+        }   }
+}
+
+
+/* **********************************************************************
+ *  AsyncIO functionality
+ ************************************************************************/
+
+/* AIO_supported:
+ * Returns 1 if AsyncIO is supported on the system, 0 otherwise. */
+int AIO_supported(void) {
+#ifdef ZSTD_MULTITHREAD
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+/* ***********************************
+ *  Generic IoPool implementation
+ *************************************/
+
+static IOJob_t *AIO_IOPool_createIoJob(IOPoolCtx_t *ctx, size_t bufferSize) {
+    IOJob_t* const job  = (IOJob_t*) malloc(sizeof(IOJob_t));
+    void* const buffer = malloc(bufferSize);
+    if(!job || !buffer)
+        EXM_THROW(101, "Allocation error : not enough memory");
+    job->buffer = buffer;
+    job->bufferSize = bufferSize;
+    job->usedBufferSize = 0;
+    job->file = NULL;
+    job->ctx = ctx;
+    job->offset = 0;
+    return job;
+}
+
+
+/* AIO_IOPool_createThreadPool:
+ * Creates a thread pool and a mutex for threaded IO pool.
+ * Displays warning if asyncio is requested but MT isn't available. */
+static void AIO_IOPool_createThreadPool(IOPoolCtx_t* ctx, const FIO_prefs_t* prefs) {
+    ctx->threadPool = NULL;
+    ctx->threadPoolActive = 0;
+    if(prefs->asyncIO) {
+        if (ZSTD_pthread_mutex_init(&ctx->ioJobsMutex, NULL))
+            EXM_THROW(102,"Failed creating ioJobsMutex mutex");
+        /* We want MAX_IO_JOBS-2 queue items because we need to always have 1 free buffer to
+         * decompress into and 1 buffer that's actively written to disk and owned by the writing thread. */
+        assert(MAX_IO_JOBS >= 2);
+        ctx->threadPool = POOL_create(1, MAX_IO_JOBS - 2);
+        ctx->threadPoolActive = 1;
+        if (!ctx->threadPool)
+            EXM_THROW(104, "Failed creating I/O thread pool");
+    }
+}
+
+/* AIO_IOPool_init:
+ * Allocates and sets and a new I/O thread pool including its included availableJobs. */
+static void AIO_IOPool_init(IOPoolCtx_t* ctx, const FIO_prefs_t* prefs, POOL_function poolFunction, size_t bufferSize) {
+    int i;
+    AIO_IOPool_createThreadPool(ctx, prefs);
+    ctx->prefs = prefs;
+    ctx->poolFunction = poolFunction;
+    ctx->totalIoJobs = ctx->threadPool ? MAX_IO_JOBS : 2;
+    ctx->availableJobsCount = ctx->totalIoJobs;
+    for(i=0; i < ctx->availableJobsCount; i++) {
+        ctx->availableJobs[i] = AIO_IOPool_createIoJob(ctx, bufferSize);
+    }
+    ctx->jobBufferSize = bufferSize;
+    ctx->file = NULL;
+}
+
+
+/* AIO_IOPool_threadPoolActive:
+ * Check if current operation uses thread pool.
+ * Note that in some cases we have a thread pool initialized but choose not to use it. */
+static int AIO_IOPool_threadPoolActive(IOPoolCtx_t* ctx) {
+    return ctx->threadPool && ctx->threadPoolActive;
+}
+
+
+/* AIO_IOPool_lockJobsMutex:
+ * Locks the IO jobs mutex if threading is active */
+static void AIO_IOPool_lockJobsMutex(IOPoolCtx_t* ctx) {
+    if(AIO_IOPool_threadPoolActive(ctx))
+        ZSTD_pthread_mutex_lock(&ctx->ioJobsMutex);
+}
+
+/* AIO_IOPool_unlockJobsMutex:
+ * Unlocks the IO jobs mutex if threading is active */
+static void AIO_IOPool_unlockJobsMutex(IOPoolCtx_t* ctx) {
+    if(AIO_IOPool_threadPoolActive(ctx))
+        ZSTD_pthread_mutex_unlock(&ctx->ioJobsMutex);
+}
+
+/* AIO_IOPool_releaseIoJob:
+ * Releases an acquired job back to the pool. Doesn't execute the job. */
+static void AIO_IOPool_releaseIoJob(IOJob_t* job) {
+    IOPoolCtx_t* const ctx = (IOPoolCtx_t *) job->ctx;
+    AIO_IOPool_lockJobsMutex(ctx);
+    assert(ctx->availableJobsCount < ctx->totalIoJobs);
+    ctx->availableJobs[ctx->availableJobsCount++] = job;
+    AIO_IOPool_unlockJobsMutex(ctx);
+}
+
+/* AIO_IOPool_join:
+ * Waits for all tasks in the pool to finish executing. */
+static void AIO_IOPool_join(IOPoolCtx_t* ctx) {
+    if(AIO_IOPool_threadPoolActive(ctx))
+        POOL_joinJobs(ctx->threadPool);
+}
+
+/* AIO_IOPool_setThreaded:
+ * Allows (de)activating threaded mode, to be used when the expected overhead
+ * of threading costs more than the expected gains. */
+static void AIO_IOPool_setThreaded(IOPoolCtx_t* ctx, int threaded) {
+    assert(threaded == 0 || threaded == 1);
+    assert(ctx != NULL);
+    if(ctx->threadPoolActive != threaded) {
+        AIO_IOPool_join(ctx);
+        ctx->threadPoolActive = threaded;
+    }
+}
+
+/* AIO_IOPool_free:
+ * Release a previously allocated IO thread pool. Makes sure all tasks are done and released. */
+static void AIO_IOPool_destroy(IOPoolCtx_t* ctx) {
+    int i;
+    if(ctx->threadPool) {
+        /* Make sure we finish all tasks and then free the resources */
+        AIO_IOPool_join(ctx);
+        /* Make sure we are not leaking availableJobs */
+        assert(ctx->availableJobsCount == ctx->totalIoJobs);
+        POOL_free(ctx->threadPool);
+        ZSTD_pthread_mutex_destroy(&ctx->ioJobsMutex);
+    }
+    assert(ctx->file == NULL);
+    for(i=0; i<ctx->availableJobsCount; i++) {
+        IOJob_t* job = (IOJob_t*) ctx->availableJobs[i];
+        free(job->buffer);
+        free(job);
+    }
+}
+
+/* AIO_IOPool_acquireJob:
+ * Returns an available io job to be used for a future io. */
+static IOJob_t* AIO_IOPool_acquireJob(IOPoolCtx_t* ctx) {
+    IOJob_t *job;
+    assert(ctx->file != NULL || ctx->prefs->testMode);
+    AIO_IOPool_lockJobsMutex(ctx);
+    assert(ctx->availableJobsCount > 0);
+    job = (IOJob_t*) ctx->availableJobs[--ctx->availableJobsCount];
+    AIO_IOPool_unlockJobsMutex(ctx);
+    job->usedBufferSize = 0;
+    job->file = ctx->file;
+    job->offset = 0;
+    return job;
+}
+
+
+/* AIO_IOPool_setFile:
+ * Sets the destination file for future files in the pool.
+ * Requires completion of all queued jobs and release of all otherwise acquired jobs. */
+static void AIO_IOPool_setFile(IOPoolCtx_t* ctx, FILE* file) {
+    assert(ctx!=NULL);
+    AIO_IOPool_join(ctx);
+    assert(ctx->availableJobsCount == ctx->totalIoJobs);
+    ctx->file = file;
+}
+
+static FILE* AIO_IOPool_getFile(const IOPoolCtx_t* ctx) {
+    return ctx->file;
+}
+
+/* AIO_IOPool_enqueueJob:
+ * Enqueues an io job for execution.
+ * The queued job shouldn't be used directly after queueing it. */
+static void AIO_IOPool_enqueueJob(IOJob_t* job) {
+    IOPoolCtx_t* const ctx = (IOPoolCtx_t *)job->ctx;
+    if(AIO_IOPool_threadPoolActive(ctx))
+        POOL_add(ctx->threadPool, ctx->poolFunction, job);
+    else
+        ctx->poolFunction(job);
+}
+
+/* ***********************************
+ *  WritePool implementation
+ *************************************/
+
+/* AIO_WritePool_acquireJob:
+ * Returns an available write job to be used for a future write. */
+IOJob_t* AIO_WritePool_acquireJob(WritePoolCtx_t* ctx) {
+    return AIO_IOPool_acquireJob(&ctx->base);
+}
+
+/* AIO_WritePool_enqueueAndReacquireWriteJob:
+ * Queues a write job for execution and acquires a new one.
+ * After execution `job`'s pointed value would change to the newly acquired job.
+ * Make sure to set `usedBufferSize` to the wanted length before call.
+ * The queued job shouldn't be used directly after queueing it. */
+void AIO_WritePool_enqueueAndReacquireWriteJob(IOJob_t **job) {
+    AIO_IOPool_enqueueJob(*job);
+    *job = AIO_IOPool_acquireJob((IOPoolCtx_t *)(*job)->ctx);
+}
+
+/* AIO_WritePool_sparseWriteEnd:
+ * Ends sparse writes to the current file.
+ * Blocks on completion of all current write jobs before executing. */
+void AIO_WritePool_sparseWriteEnd(WritePoolCtx_t* ctx) {
+    assert(ctx != NULL);
+    AIO_IOPool_join(&ctx->base);
+    AIO_fwriteSparseEnd(ctx->base.prefs, ctx->base.file, ctx->storedSkips);
+    ctx->storedSkips = 0;
+}
+
+/* AIO_WritePool_setFile:
+ * Sets the destination file for future writes in the pool.
+ * Requires completion of all queues write jobs and release of all otherwise acquired jobs.
+ * Also requires ending of sparse write if a previous file was used in sparse mode. */
+void AIO_WritePool_setFile(WritePoolCtx_t* ctx, FILE* file) {
+    AIO_IOPool_setFile(&ctx->base, file);
+    assert(ctx->storedSkips == 0);
+}
+
+/* AIO_WritePool_getFile:
+ * Returns the file the writePool is currently set to write to. */
+FILE* AIO_WritePool_getFile(const WritePoolCtx_t* ctx) {
+    return AIO_IOPool_getFile(&ctx->base);
+}
+
+/* AIO_WritePool_releaseIoJob:
+ * Releases an acquired job back to the pool. Doesn't execute the job. */
+void AIO_WritePool_releaseIoJob(IOJob_t* job) {
+    AIO_IOPool_releaseIoJob(job);
+}
+
+/* AIO_WritePool_closeFile:
+ * Ends sparse write and closes the writePool's current file and sets the file to NULL.
+ * Requires completion of all queues write jobs and release of all otherwise acquired jobs.  */
+int AIO_WritePool_closeFile(WritePoolCtx_t* ctx) {
+    FILE* const dstFile = ctx->base.file;
+    assert(dstFile!=NULL || ctx->base.prefs->testMode!=0);
+    AIO_WritePool_sparseWriteEnd(ctx);
+    AIO_IOPool_setFile(&ctx->base, NULL);
+    return fclose(dstFile);
+}
+
+/* AIO_WritePool_executeWriteJob:
+ * Executes a write job synchronously. Can be used as a function for a thread pool. */
+static void AIO_WritePool_executeWriteJob(void* opaque){
+    IOJob_t* const job = (IOJob_t*) opaque;
+    WritePoolCtx_t* const ctx = (WritePoolCtx_t*) job->ctx;
+    ctx->storedSkips = AIO_fwriteSparse(job->file, job->buffer, job->usedBufferSize, ctx->base.prefs, ctx->storedSkips);
+    AIO_IOPool_releaseIoJob(job);
+}
+
+/* AIO_WritePool_create:
+ * Allocates and sets and a new write pool including its included jobs. */
+WritePoolCtx_t* AIO_WritePool_create(const FIO_prefs_t* prefs, size_t bufferSize) {
+    WritePoolCtx_t* const ctx = (WritePoolCtx_t*) malloc(sizeof(WritePoolCtx_t));
+    if(!ctx) EXM_THROW(100, "Allocation error : not enough memory");
+    AIO_IOPool_init(&ctx->base, prefs, AIO_WritePool_executeWriteJob, bufferSize);
+    ctx->storedSkips = 0;
+    return ctx;
+}
+
+/* AIO_WritePool_free:
+ * Frees and releases a writePool and its resources. Closes destination file if needs to. */
+void AIO_WritePool_free(WritePoolCtx_t* ctx) {
+    /* Make sure we finish all tasks and then free the resources */
+    if(AIO_WritePool_getFile(ctx))
+        AIO_WritePool_closeFile(ctx);
+    AIO_IOPool_destroy(&ctx->base);
+    assert(ctx->storedSkips==0);
+    free(ctx);
+}
+
+/* AIO_WritePool_setAsync:
+ * Allows (de)activating async mode, to be used when the expected overhead
+ * of asyncio costs more than the expected gains. */
+void AIO_WritePool_setAsync(WritePoolCtx_t* ctx, int async) {
+    AIO_IOPool_setThreaded(&ctx->base, async);
+}
+
+
+/* ***********************************
+ *  ReadPool implementation
+ *************************************/
+static void AIO_ReadPool_releaseAllCompletedJobs(ReadPoolCtx_t* ctx) {
+    int i;
+    for(i=0; i<ctx->completedJobsCount; i++) {
+        IOJob_t* job = (IOJob_t*) ctx->completedJobs[i];
+        AIO_IOPool_releaseIoJob(job);
+    }
+    ctx->completedJobsCount = 0;
+}
+
+static void AIO_ReadPool_addJobToCompleted(IOJob_t* job) {
+    ReadPoolCtx_t* const ctx = (ReadPoolCtx_t *)job->ctx;
+    AIO_IOPool_lockJobsMutex(&ctx->base);
+    assert(ctx->completedJobsCount < MAX_IO_JOBS);
+    ctx->completedJobs[ctx->completedJobsCount++] = job;
+    if(AIO_IOPool_threadPoolActive(&ctx->base)) {
+        ZSTD_pthread_cond_signal(&ctx->jobCompletedCond);
+    }
+    AIO_IOPool_unlockJobsMutex(&ctx->base);
+}
+
+/* AIO_ReadPool_findNextWaitingOffsetCompletedJob_locked:
+ * Looks through the completed jobs for a job matching the waitingOnOffset and returns it,
+ * if job wasn't found returns NULL.
+ * IMPORTANT: assumes ioJobsMutex is locked. */
+static IOJob_t* AIO_ReadPool_findNextWaitingOffsetCompletedJob_locked(ReadPoolCtx_t* ctx) {
+    IOJob_t *job = NULL;
+    int i;
+    /* This implementation goes through all completed jobs and looks for the one matching the next offset.
+     * While not strictly needed for a single threaded reader implementation (as in such a case we could expect
+     * reads to be completed in order) this implementation was chosen as it better fits other asyncio
+     * interfaces (such as io_uring) that do not provide promises regarding order of completion. */
+    for (i=0; i<ctx->completedJobsCount; i++) {
+        job = (IOJob_t *) ctx->completedJobs[i];
+        if (job->offset == ctx->waitingOnOffset) {
+            ctx->completedJobs[i] = ctx->completedJobs[--ctx->completedJobsCount];
+            return job;
+        }
+    }
+    return NULL;
+}
+
+/* AIO_ReadPool_numReadsInFlight:
+ * Returns the number of IO read jobs currently in flight. */
+static size_t AIO_ReadPool_numReadsInFlight(ReadPoolCtx_t* ctx) {
+    const size_t jobsHeld = (ctx->currentJobHeld==NULL ? 0 : 1);
+    return ctx->base.totalIoJobs - (ctx->base.availableJobsCount + ctx->completedJobsCount + jobsHeld);
+}
+
+/* AIO_ReadPool_getNextCompletedJob:
+ * Returns a completed IOJob_t for the next read in line based on waitingOnOffset and advances waitingOnOffset.
+ * Would block. */
+static IOJob_t* AIO_ReadPool_getNextCompletedJob(ReadPoolCtx_t* ctx) {
+    IOJob_t *job = NULL;
+    AIO_IOPool_lockJobsMutex(&ctx->base);
+
+    job = AIO_ReadPool_findNextWaitingOffsetCompletedJob_locked(ctx);
+
+    /* As long as we didn't find the job matching the next read, and we have some reads in flight continue waiting */
+    while (!job && (AIO_ReadPool_numReadsInFlight(ctx) > 0)) {
+        assert(ctx->base.threadPool != NULL); /* we shouldn't be here if we work in sync mode */
+        ZSTD_pthread_cond_wait(&ctx->jobCompletedCond, &ctx->base.ioJobsMutex);
+        job = AIO_ReadPool_findNextWaitingOffsetCompletedJob_locked(ctx);
+    }
+
+    if(job) {
+        assert(job->offset == ctx->waitingOnOffset);
+        ctx->waitingOnOffset += job->usedBufferSize;
+    }
+
+    AIO_IOPool_unlockJobsMutex(&ctx->base);
+    return job;
+}
+
+
+/* AIO_ReadPool_executeReadJob:
+ * Executes a read job synchronously. Can be used as a function for a thread pool. */
+static void AIO_ReadPool_executeReadJob(void* opaque){
+    IOJob_t* const job = (IOJob_t*) opaque;
+    ReadPoolCtx_t* const ctx = (ReadPoolCtx_t *)job->ctx;
+    if(ctx->reachedEof) {
+        job->usedBufferSize = 0;
+        AIO_ReadPool_addJobToCompleted(job);
+        return;
+    }
+    job->usedBufferSize = fread(job->buffer, 1, job->bufferSize, job->file);
+    if(job->usedBufferSize < job->bufferSize) {
+        if(ferror(job->file)) {
+            EXM_THROW(37, "Read error");
+        } else if(feof(job->file)) {
+            ctx->reachedEof = 1;
+        } else {
+            EXM_THROW(37, "Unexpected short read");
+        }
+    }
+    AIO_ReadPool_addJobToCompleted(job);
+}
+
+static void AIO_ReadPool_enqueueRead(ReadPoolCtx_t* ctx) {
+    IOJob_t* const job = AIO_IOPool_acquireJob(&ctx->base);
+    job->offset = ctx->nextReadOffset;
+    ctx->nextReadOffset += job->bufferSize;
+    AIO_IOPool_enqueueJob(job);
+}
+
+static void AIO_ReadPool_startReading(ReadPoolCtx_t* ctx) {
+    int i;
+    for (i = 0; i < ctx->base.availableJobsCount; i++) {
+        AIO_ReadPool_enqueueRead(ctx);
+    }
+}
+
+/* AIO_ReadPool_setFile:
+ * Sets the source file for future read in the pool. Initiates reading immediately if file is not NULL.
+ * Waits for all current enqueued tasks to complete if a previous file was set. */
+void AIO_ReadPool_setFile(ReadPoolCtx_t* ctx, FILE* file) {
+    assert(ctx!=NULL);
+    AIO_IOPool_join(&ctx->base);
+    AIO_ReadPool_releaseAllCompletedJobs(ctx);
+    if (ctx->currentJobHeld) {
+        AIO_IOPool_releaseIoJob((IOJob_t *)ctx->currentJobHeld);
+        ctx->currentJobHeld = NULL;
+    }
+    AIO_IOPool_setFile(&ctx->base, file);
+    ctx->nextReadOffset = 0;
+    ctx->waitingOnOffset = 0;
+    ctx->srcBuffer = ctx->coalesceBuffer;
+    ctx->srcBufferLoaded = 0;
+    ctx->reachedEof = 0;
+    if(file != NULL)
+        AIO_ReadPool_startReading(ctx);
+}
+
+/* AIO_ReadPool_create:
+ * Allocates and sets and a new readPool including its included jobs.
+ * bufferSize should be set to the maximal buffer we want to read at a time, will also be used
+ * as our basic read size. */
+ReadPoolCtx_t* AIO_ReadPool_create(const FIO_prefs_t* prefs, size_t bufferSize) {
+    ReadPoolCtx_t* const ctx = (ReadPoolCtx_t*) malloc(sizeof(ReadPoolCtx_t));
+    if(!ctx) EXM_THROW(100, "Allocation error : not enough memory");
+    AIO_IOPool_init(&ctx->base, prefs, AIO_ReadPool_executeReadJob, bufferSize);
+
+    ctx->coalesceBuffer = (U8*) malloc(bufferSize * 2);
+    ctx->srcBuffer = ctx->coalesceBuffer;
+    ctx->srcBufferLoaded = 0;
+    ctx->completedJobsCount = 0;
+    ctx->currentJobHeld = NULL;
+
+    if(ctx->base.threadPool)
+        if (ZSTD_pthread_cond_init(&ctx->jobCompletedCond, NULL))
+            EXM_THROW(103,"Failed creating jobCompletedCond cond");
+
+    return ctx;
+}
+
+/* AIO_ReadPool_free:
+ * Frees and releases a readPool and its resources. Closes source file. */
+void AIO_ReadPool_free(ReadPoolCtx_t* ctx) {
+    if(AIO_ReadPool_getFile(ctx))
+        AIO_ReadPool_closeFile(ctx);
+    if(ctx->base.threadPool)
+        ZSTD_pthread_cond_destroy(&ctx->jobCompletedCond);
+    AIO_IOPool_destroy(&ctx->base);
+    free(ctx->coalesceBuffer);
+    free(ctx);
+}
+
+/* AIO_ReadPool_consumeBytes:
+ * Consumes byes from srcBuffer's beginning and updates srcBufferLoaded accordingly. */
+void AIO_ReadPool_consumeBytes(ReadPoolCtx_t* ctx, size_t n) {
+    assert(n <= ctx->srcBufferLoaded);
+    ctx->srcBufferLoaded -= n;
+    ctx->srcBuffer += n;
+}
+
+/* AIO_ReadPool_releaseCurrentlyHeldAndGetNext:
+ * Release the current held job and get the next one, returns NULL if no next job available. */
+static IOJob_t* AIO_ReadPool_releaseCurrentHeldAndGetNext(ReadPoolCtx_t* ctx) {
+    if (ctx->currentJobHeld) {
+        AIO_IOPool_releaseIoJob((IOJob_t *)ctx->currentJobHeld);
+        ctx->currentJobHeld = NULL;
+        AIO_ReadPool_enqueueRead(ctx);
+    }
+    ctx->currentJobHeld = AIO_ReadPool_getNextCompletedJob(ctx);
+    return (IOJob_t*) ctx->currentJobHeld;
+}
+
+/* AIO_ReadPool_fillBuffer:
+ * Tries to fill the buffer with at least n or jobBufferSize bytes (whichever is smaller).
+ * Returns if srcBuffer has at least the expected number of bytes loaded or if we've reached the end of the file.
+ * Return value is the number of bytes added to the buffer.
+ * Note that srcBuffer might have up to 2 times jobBufferSize bytes. */
+size_t AIO_ReadPool_fillBuffer(ReadPoolCtx_t* ctx, size_t n) {
+    IOJob_t *job;
+    int useCoalesce = 0;
+    if(n > ctx->base.jobBufferSize)
+        n = ctx->base.jobBufferSize;
+
+    /* We are good, don't read anything */
+    if (ctx->srcBufferLoaded >= n)
+        return 0;
+
+    /* We still have bytes loaded, but not enough to satisfy caller. We need to get the next job
+     * and coalesce the remaining bytes with the next job's buffer */
+    if (ctx->srcBufferLoaded > 0) {
+        useCoalesce = 1;
+        memcpy(ctx->coalesceBuffer, ctx->srcBuffer, ctx->srcBufferLoaded);
+        ctx->srcBuffer = ctx->coalesceBuffer;
+    }
+
+    /* Read the next chunk */
+    job = AIO_ReadPool_releaseCurrentHeldAndGetNext(ctx);
+    if(!job)
+        return 0;
+    if(useCoalesce) {
+        assert(ctx->srcBufferLoaded + job->usedBufferSize <= 2*ctx->base.jobBufferSize);
+        memcpy(ctx->coalesceBuffer + ctx->srcBufferLoaded, job->buffer, job->usedBufferSize);
+        ctx->srcBufferLoaded += job->usedBufferSize;
+    }
+    else {
+        ctx->srcBuffer = (U8 *) job->buffer;
+        ctx->srcBufferLoaded = job->usedBufferSize;
+    }
+    return job->usedBufferSize;
+}
+
+/* AIO_ReadPool_consumeAndRefill:
+ * Consumes the current buffer and refills it with bufferSize bytes. */
+size_t AIO_ReadPool_consumeAndRefill(ReadPoolCtx_t* ctx) {
+    AIO_ReadPool_consumeBytes(ctx, ctx->srcBufferLoaded);
+    return AIO_ReadPool_fillBuffer(ctx, ctx->base.jobBufferSize);
+}
+
+/* AIO_ReadPool_getFile:
+ * Returns the current file set for the read pool. */
+FILE* AIO_ReadPool_getFile(const ReadPoolCtx_t* ctx) {
+    return AIO_IOPool_getFile(&ctx->base);
+}
+
+/* AIO_ReadPool_closeFile:
+ * Closes the current set file. Waits for all current enqueued tasks to complete and resets state. */
+int AIO_ReadPool_closeFile(ReadPoolCtx_t* ctx) {
+    FILE* const file = AIO_ReadPool_getFile(ctx);
+    AIO_ReadPool_setFile(ctx, NULL);
+    return fclose(file);
+}
+
+/* AIO_ReadPool_setAsync:
+ * Allows (de)activating async mode, to be used when the expected overhead
+ * of asyncio costs more than the expected gains. */
+void AIO_ReadPool_setAsync(ReadPoolCtx_t* ctx, int async) {
+    AIO_IOPool_setThreaded(&ctx->base, async);
+}
--- a/third_party/zstd/programs/fileio_asyncio.h
+++ b/third_party/zstd/programs/fileio_asyncio.h
@ -0,0 +1,204 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ /*
+  * FileIO AsyncIO exposes read/write IO pools that allow doing IO asynchronously.
+  * Current implementation relies on having one thread that reads and one that
+  * writes.
+  * Each IO pool supports up to `MAX_IO_JOBS` that can be enqueued for work, but
+  * are performed serially by the appropriate worker thread.
+  * Most systems exposes better primitives to perform asynchronous IO, such as
+  * io_uring on newer linux systems. The API is built in such a way that in the
+  * future we could replace the threads with better solutions when available.
+  */
+
+#ifndef ZSTD_FILEIO_ASYNCIO_H
+#define ZSTD_FILEIO_ASYNCIO_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include "third_party/zstd/lib/common/mem.h"     /* U32, U64 */
+#include "third_party/zstd/programs/fileio_types.h"
+#include "third_party/zstd/programs/platform.h"
+#include "third_party/zstd/programs/util.h"
+#include "third_party/zstd/lib/common/pool.h"
+#include "third_party/zstd/lib/common/threading.h"
+
+#define MAX_IO_JOBS          (10)
+
+typedef struct {
+    /* These struct fields should be set only on creation and not changed afterwards */
+    POOL_ctx* threadPool;
+    int threadPoolActive;
+    int totalIoJobs;
+    const FIO_prefs_t* prefs;
+    POOL_function poolFunction;
+
+    /* Controls the file we currently write to, make changes only by using provided utility functions */
+    FILE* file;
+
+    /* The jobs and availableJobsCount fields are accessed by both the main and worker threads and should
+     * only be mutated after locking the mutex */
+    ZSTD_pthread_mutex_t ioJobsMutex;
+    void* availableJobs[MAX_IO_JOBS];
+    int availableJobsCount;
+    size_t jobBufferSize;
+} IOPoolCtx_t;
+
+typedef struct {
+    IOPoolCtx_t base;
+
+    /* State regarding the currently read file */
+    int reachedEof;
+    U64 nextReadOffset;
+    U64 waitingOnOffset;
+
+    /* We may hold an IOJob object as needed if we actively expose its buffer. */
+    void *currentJobHeld;
+
+    /* Coalesce buffer is used to join two buffers in case where we need to read more bytes than left in
+     * the first of them. Shouldn't be accessed from outside ot utility functions. */
+    U8 *coalesceBuffer;
+
+    /* Read buffer can be used by consumer code, take care when copying this pointer aside as it might
+     * change when consuming / refilling buffer. */
+    U8 *srcBuffer;
+    size_t srcBufferLoaded;
+
+    /* We need to know what tasks completed so we can use their buffers when their time comes.
+     * Should only be accessed after locking base.ioJobsMutex . */
+    void* completedJobs[MAX_IO_JOBS];
+    int completedJobsCount;
+    ZSTD_pthread_cond_t jobCompletedCond;
+} ReadPoolCtx_t;
+
+typedef struct {
+    IOPoolCtx_t base;
+    unsigned storedSkips;
+} WritePoolCtx_t;
+
+typedef struct {
+    /* These fields are automatically set and shouldn't be changed by non WritePool code. */
+    void *ctx;
+    FILE* file;
+    void *buffer;
+    size_t bufferSize;
+
+    /* This field should be changed before a job is queued for execution and should contain the number
+     * of bytes to write from the buffer. */
+    size_t usedBufferSize;
+    U64 offset;
+} IOJob_t;
+
+/* AIO_supported:
+ * Returns 1 if AsyncIO is supported on the system, 0 otherwise. */
+int AIO_supported(void);
+
+
+/* AIO_WritePool_releaseIoJob:
+ * Releases an acquired job back to the pool. Doesn't execute the job. */
+void AIO_WritePool_releaseIoJob(IOJob_t *job);
+
+/* AIO_WritePool_acquireJob:
+ * Returns an available write job to be used for a future write. */
+IOJob_t* AIO_WritePool_acquireJob(WritePoolCtx_t *ctx);
+
+/* AIO_WritePool_enqueueAndReacquireWriteJob:
+ * Enqueues a write job for execution and acquires a new one.
+ * After execution `job`'s pointed value would change to the newly acquired job.
+ * Make sure to set `usedBufferSize` to the wanted length before call.
+ * The queued job shouldn't be used directly after queueing it. */
+void AIO_WritePool_enqueueAndReacquireWriteJob(IOJob_t **job);
+
+/* AIO_WritePool_sparseWriteEnd:
+ * Ends sparse writes to the current file.
+ * Blocks on completion of all current write jobs before executing. */
+void AIO_WritePool_sparseWriteEnd(WritePoolCtx_t *ctx);
+
+/* AIO_WritePool_setFile:
+ * Sets the destination file for future writes in the pool.
+ * Requires completion of all queues write jobs and release of all otherwise acquired jobs.
+ * Also requires ending of sparse write if a previous file was used in sparse mode. */
+void AIO_WritePool_setFile(WritePoolCtx_t *ctx, FILE* file);
+
+/* AIO_WritePool_getFile:
+ * Returns the file the writePool is currently set to write to. */
+FILE* AIO_WritePool_getFile(const WritePoolCtx_t* ctx);
+
+/* AIO_WritePool_closeFile:
+ * Ends sparse write and closes the writePool's current file and sets the file to NULL.
+ * Requires completion of all queues write jobs and release of all otherwise acquired jobs.  */
+int AIO_WritePool_closeFile(WritePoolCtx_t *ctx);
+
+/* AIO_WritePool_create:
+ * Allocates and sets and a new write pool including its included jobs.
+ * bufferSize should be set to the maximal buffer we want to write to at a time. */
+WritePoolCtx_t* AIO_WritePool_create(const FIO_prefs_t* prefs, size_t bufferSize);
+
+/* AIO_WritePool_free:
+ * Frees and releases a writePool and its resources. Closes destination file. */
+void AIO_WritePool_free(WritePoolCtx_t* ctx);
+
+/* AIO_WritePool_setAsync:
+ * Allows (de)activating async mode, to be used when the expected overhead
+ * of asyncio costs more than the expected gains. */
+void AIO_WritePool_setAsync(WritePoolCtx_t* ctx, int async);
+
+/* AIO_ReadPool_create:
+ * Allocates and sets and a new readPool including its included jobs.
+ * bufferSize should be set to the maximal buffer we want to read at a time, will also be used
+ * as our basic read size. */
+ReadPoolCtx_t* AIO_ReadPool_create(const FIO_prefs_t* prefs, size_t bufferSize);
+
+/* AIO_ReadPool_free:
+ * Frees and releases a readPool and its resources. Closes source file. */
+void AIO_ReadPool_free(ReadPoolCtx_t* ctx);
+
+/* AIO_ReadPool_setAsync:
+ * Allows (de)activating async mode, to be used when the expected overhead
+ * of asyncio costs more than the expected gains. */
+void AIO_ReadPool_setAsync(ReadPoolCtx_t* ctx, int async);
+
+/* AIO_ReadPool_consumeBytes:
+ * Consumes byes from srcBuffer's beginning and updates srcBufferLoaded accordingly. */
+void AIO_ReadPool_consumeBytes(ReadPoolCtx_t *ctx, size_t n);
+
+/* AIO_ReadPool_fillBuffer:
+ * Makes sure buffer has at least n bytes loaded (as long as n is not bigger than the initialized bufferSize).
+ * Returns if srcBuffer has at least n bytes loaded or if we've reached the end of the file.
+ * Return value is the number of bytes added to the buffer.
+ * Note that srcBuffer might have up to 2 times bufferSize bytes. */
+size_t AIO_ReadPool_fillBuffer(ReadPoolCtx_t *ctx, size_t n);
+
+/* AIO_ReadPool_consumeAndRefill:
+ * Consumes the current buffer and refills it with bufferSize bytes. */
+size_t AIO_ReadPool_consumeAndRefill(ReadPoolCtx_t *ctx);
+
+/* AIO_ReadPool_setFile:
+ * Sets the source file for future read in the pool. Initiates reading immediately if file is not NULL.
+ * Waits for all current enqueued tasks to complete if a previous file was set. */
+void AIO_ReadPool_setFile(ReadPoolCtx_t *ctx, FILE* file);
+
+/* AIO_ReadPool_getFile:
+ * Returns the current file set for the read pool. */
+FILE* AIO_ReadPool_getFile(const ReadPoolCtx_t *ctx);
+
+/* AIO_ReadPool_closeFile:
+ * Closes the current set file. Waits for all current enqueued tasks to complete and resets state. */
+int AIO_ReadPool_closeFile(ReadPoolCtx_t *ctx);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_FILEIO_ASYNCIO_H */
--- a/third_party/zstd/programs/fileio_common.h
+++ b/third_party/zstd/programs/fileio_common.h
@ -0,0 +1,144 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_FILEIO_COMMON_H
+#define ZSTD_FILEIO_COMMON_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#include "third_party/zstd/lib/common/mem.h"     /* U32, U64 */
+#include "third_party/zstd/programs/fileio_types.h"
+#include "third_party/zstd/programs/platform.h"
+#include "third_party/zstd/programs/timefn.h"     /* UTIL_getTime, UTIL_clockSpanMicro */
+
+/*-*************************************
+*  Macros
+***************************************/
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+#undef MAX
+#define MAX(a,b) ((a)>(b) ? (a) : (b))
+
+extern FIO_display_prefs_t g_display_prefs;
+
+#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
+#define DISPLAYOUT(...)      fprintf(stdout, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) { if (g_display_prefs.displayLevel>=l) { DISPLAY(__VA_ARGS__); } }
+
+extern UTIL_time_t g_displayClock;
+
+#define REFRESH_RATE  ((U64)(SEC_TO_MICRO / 6))
+#define READY_FOR_UPDATE() (UTIL_clockSpanMicro(g_displayClock) > REFRESH_RATE || g_display_prefs.displayLevel >= 4)
+#define DELAY_NEXT_UPDATE() { g_displayClock = UTIL_getTime(); }
+#define DISPLAYUPDATE(l, ...) {                              \
+        if (g_display_prefs.displayLevel>=l && (g_display_prefs.progressSetting != FIO_ps_never)) { \
+            if (READY_FOR_UPDATE()) { \
+                DELAY_NEXT_UPDATE();                         \
+                DISPLAY(__VA_ARGS__);                        \
+                if (g_display_prefs.displayLevel>=4) fflush(stderr);       \
+    }   }   }
+
+#define SHOULD_DISPLAY_SUMMARY() \
+    (g_display_prefs.displayLevel >= 2 || g_display_prefs.progressSetting == FIO_ps_always)
+#define SHOULD_DISPLAY_PROGRESS()                       \
+    (g_display_prefs.progressSetting != FIO_ps_never && SHOULD_DISPLAY_SUMMARY())
+#define DISPLAY_PROGRESS(...) { if (SHOULD_DISPLAY_PROGRESS()) { DISPLAYLEVEL(1, __VA_ARGS__); }}
+#define DISPLAYUPDATE_PROGRESS(...) { if (SHOULD_DISPLAY_PROGRESS()) { DISPLAYUPDATE(1, __VA_ARGS__); }}
+#define DISPLAY_SUMMARY(...) { if (SHOULD_DISPLAY_SUMMARY()) { DISPLAYLEVEL(1, __VA_ARGS__); } }
+
+#undef MIN  /* in case it would be already defined */
+#define MIN(a,b)    ((a) < (b) ? (a) : (b))
+
+
+#define EXM_THROW(error, ...)                                             \
+{                                                                         \
+    DISPLAYLEVEL(1, "zstd: ");                                            \
+    DISPLAYLEVEL(5, "Error defined at %s, line %i : \n", __FILE__, __LINE__); \
+    DISPLAYLEVEL(1, "error %i : ", error);                                \
+    DISPLAYLEVEL(1, __VA_ARGS__);                                         \
+    DISPLAYLEVEL(1, " \n");                                               \
+    exit(error);                                                          \
+}
+
+#define CHECK_V(v, f)                                \
+    v = f;                                           \
+    if (ZSTD_isError(v)) {                           \
+        DISPLAYLEVEL(5, "%s \n", #f);                \
+        EXM_THROW(11, "%s", ZSTD_getErrorName(v));   \
+    }
+#define CHECK(f) { size_t err; CHECK_V(err, f); }
+
+
+/* Avoid fseek()'s 2GiB barrier with MSVC, macOS, *BSD, MinGW */
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+#   define LONG_SEEK _fseeki64
+#   define LONG_TELL _ftelli64
+#elif !defined(__64BIT__) && (PLATFORM_POSIX_VERSION >= 200112L) /* No point defining Large file for 64 bit */
+#  define LONG_SEEK fseeko
+#  define LONG_TELL ftello
+#elif defined(__MINGW32__) && !defined(__STRICT_ANSI__) && !defined(__NO_MINGW_LFS) && defined(__MSVCRT__)
+#   define LONG_SEEK fseeko64
+#   define LONG_TELL ftello64
+#elif defined(_WIN32) && !defined(__DJGPP__)
+#include "libc/nt/accounting.h"
+#include "libc/nt/automation.h"
+#include "libc/nt/console.h"
+#include "libc/nt/debug.h"
+#include "libc/nt/dll.h"
+#include "libc/nt/enum/keyaccess.h"
+#include "libc/nt/enum/regtype.h"
+#include "libc/nt/errors.h"
+#include "libc/nt/events.h"
+#include "libc/nt/files.h"
+#include "libc/nt/ipc.h"
+#include "libc/nt/memory.h"
+#include "libc/nt/paint.h"
+#include "libc/nt/process.h"
+#include "libc/nt/registry.h"
+#include "libc/nt/synchronization.h"
+#include "libc/nt/thread.h"
+#include "libc/nt/windows.h"
+#include "libc/nt/winsock.h"
+    static int LONG_SEEK(FILE* file, __int64 offset, int origin) {
+        LARGE_INTEGER off;
+        DWORD method;
+        off.QuadPart = offset;
+        if (origin == SEEK_END)
+            method = FILE_END;
+        else if (origin == SEEK_CUR)
+            method = FILE_CURRENT;
+        else
+            method = FILE_BEGIN;
+
+        if (SetFilePointerEx((HANDLE) _get_osfhandle(_fileno(file)), off, NULL, method))
+            return 0;
+        else
+            return -1;
+    }
+    static __int64 LONG_TELL(FILE* file) {
+        LARGE_INTEGER off, newOff;
+        off.QuadPart = 0;
+        newOff.QuadPart = 0;
+        SetFilePointerEx((HANDLE) _get_osfhandle(_fileno(file)), off, &newOff, FILE_CURRENT);
+        return newOff.QuadPart;
+    }
+#else
+#   define LONG_SEEK fseek
+#   define LONG_TELL ftell
+#endif
+
+#if defined (__cplusplus)
+}
+#endif
+#endif /* ZSTD_FILEIO_COMMON_H */
--- a/third_party/zstd/programs/fileio_types.h
+++ b/third_party/zstd/programs/fileio_types.h
@ -0,0 +1,87 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef FILEIO_TYPES_HEADER
+#define FILEIO_TYPES_HEADER
+
+#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_compressionParameters */
+#include "third_party/zstd/zstd.h"           /* ZSTD_* */
+
+/*-*************************************
+*  Parameters: FIO_prefs_t
+***************************************/
+
+typedef struct FIO_display_prefs_s FIO_display_prefs_t;
+
+typedef enum { FIO_ps_auto, FIO_ps_never, FIO_ps_always } FIO_progressSetting_e;
+
+struct FIO_display_prefs_s {
+    int displayLevel;   /* 0 : no display;  1: errors;  2: + result + interaction + warnings;  3: + progression;  4: + information */
+    FIO_progressSetting_e progressSetting;
+};
+
+
+typedef enum { FIO_zstdCompression, FIO_gzipCompression, FIO_xzCompression, FIO_lzmaCompression, FIO_lz4Compression } FIO_compressionType_t;
+
+typedef struct FIO_prefs_s {
+
+    /* Algorithm preferences */
+    FIO_compressionType_t compressionType;
+    int sparseFileSupport;   /* 0: no sparse allowed; 1: auto (file yes, stdout no); 2: force sparse */
+    int dictIDFlag;
+    int checksumFlag;
+    int blockSize;
+    int overlapLog;
+    int adaptiveMode;
+    int useRowMatchFinder;
+    int rsyncable;
+    int minAdaptLevel;
+    int maxAdaptLevel;
+    int ldmFlag;
+    int ldmHashLog;
+    int ldmMinMatch;
+    int ldmBucketSizeLog;
+    int ldmHashRateLog;
+    size_t streamSrcSize;
+    size_t targetCBlockSize;
+    int srcSizeHint;
+    int testMode;
+    ZSTD_paramSwitch_e literalCompressionMode;
+
+    /* IO preferences */
+    int removeSrcFile;
+    int overwrite;
+    int asyncIO;
+
+    /* Computation resources preferences */
+    unsigned memLimit;
+    int nbWorkers;
+
+    int excludeCompressedFiles;
+    int patchFromMode;
+    int contentSize;
+    int allowBlockDevices;
+    int passThrough;
+    ZSTD_paramSwitch_e mmapDict;
+} FIO_prefs_t;
+
+typedef enum {FIO_mallocDict, FIO_mmapDict} FIO_dictBufferType_t;
+
+typedef struct {
+    void* dictBuffer;
+    size_t dictBufferSize;
+    FIO_dictBufferType_t dictBufferType;
+#if defined(_MSC_VER) || defined(_WIN32)
+    HANDLE dictHandle;
+#endif
+} FIO_Dict_t;
+
+#endif /* FILEIO_TYPES_HEADER */
--- a/third_party/zstd/programs/platform.h
+++ b/third_party/zstd/programs/platform.h
@ -0,0 +1,325 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef PLATFORM_H_MODULE
+#define PLATFORM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+
+/* **************************************
+*  Compiler Options
+****************************************/
+#if defined(_MSC_VER)
+#  define _CRT_SECURE_NO_WARNINGS    /* Disable Visual Studio warning messages for fopen, strncpy, strerror */
+#  define _CRT_NONSTDC_NO_WARNINGS   /* Disable C4996 complaining about posix function names */
+#  if (_MSC_VER <= 1800)             /* 1800 == Visual Studio 2013 */
+#    define _CRT_SECURE_NO_DEPRECATE /* VS2005 - must be declared before <io.h> and <windows.h> */
+#    define snprintf sprintf_s       /* snprintf unsupported by Visual <= 2013 */
+#  endif
+#  pragma warning(disable : 4127)    /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/* **************************************
+*  Detect 64-bit OS
+*  https://nadeausoftware.com/articles/2012/02/c_c_tip_how_detect_processor_type_using_compiler_predefined_macros
+****************************************/
+#if defined __ia64 || defined _M_IA64                                                                               /* Intel Itanium */ \
+  || defined __powerpc64__ || defined __ppc64__ || defined __PPC64__                                                /* POWER 64-bit */  \
+  || (defined __sparc && (defined __sparcv9 || defined __sparc_v9__ || defined __arch64__)) || defined __sparc64__  /* SPARC 64-bit */  \
+  || defined __x86_64__s || defined _M_X64                                                                          /* x86 64-bit */    \
+  || defined __arm64__ || defined __aarch64__ || defined __ARM64_ARCH_8__                                           /* ARM 64-bit */    \
+  || (defined __mips  && (__mips == 64 || __mips == 4 || __mips == 3))                                              /* MIPS 64-bit */   \
+  || defined _LP64 || defined __LP64__ /* NetBSD, OpenBSD */ || defined __64BIT__ /* AIX */ || defined _ADDR64 /* Cray */               \
+  || (defined __SIZEOF_POINTER__ && __SIZEOF_POINTER__ == 8) /* gcc */
+#  if !defined(__64BIT__)
+#    define __64BIT__  1
+#  endif
+#endif
+
+
+/* *********************************************************
+*  Turn on Large Files support (>4GB) for 32-bit Linux/Unix
+***********************************************************/
+#if !defined(__64BIT__) || defined(__MINGW32__)    /* No point defining Large file for 64 bit but MinGW-w64 requires it */
+#  if !defined(_FILE_OFFSET_BITS)
+#    define _FILE_OFFSET_BITS 64                   /* turn off_t into a 64-bit type for ftello, fseeko */
+#  endif
+#  if !defined(_LARGEFILE_SOURCE)                  /* obsolete macro, replaced with _FILE_OFFSET_BITS */
+#    define _LARGEFILE_SOURCE 1                    /* Large File Support extension (LFS) - fseeko, ftello */
+#  endif
+#  if defined(_AIX) || defined(__hpux)
+#    define _LARGE_FILES                           /* Large file support on 32-bits AIX and HP-UX */
+#  endif
+#endif
+
+
+/* ************************************************************
+*  Detect POSIX version
+*  PLATFORM_POSIX_VERSION = 0 for non-Unix e.g. Windows
+*  PLATFORM_POSIX_VERSION = 1 for Unix-like but non-POSIX
+*  PLATFORM_POSIX_VERSION > 1 is equal to found _POSIX_VERSION
+*  Value of PLATFORM_POSIX_VERSION can be forced on command line
+***************************************************************/
+#ifndef PLATFORM_POSIX_VERSION
+
+#  if (defined(__APPLE__) && defined(__MACH__)) || defined(__SVR4) || defined(_AIX) || defined(__hpux) /* POSIX.1-2001 (SUSv3) conformant */ \
+     || defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)  /* BSD distros */
+     /* exception rule : force posix version to 200112L,
+      * note: it's better to use unistd.h's _POSIX_VERSION whenever possible */
+#    define PLATFORM_POSIX_VERSION 200112L
+
+/* try to determine posix version through official unistd.h's _POSIX_VERSION (https://pubs.opengroup.org/onlinepubs/7908799/xsh/unistd.h.html).
+ * note : there is no simple way to know in advance if <unistd.h> is present or not on target system,
+ * Posix specification mandates its presence and its content, but target system must respect this spec.
+ * It's necessary to _not_ #include "libc/calls/calls.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/runtime/pathconf.h"
+#include "libc/runtime/runtime.h"
+#include "libc/runtime/sysconf.h"
+#include "libc/sysv/consts/f.h"
+#include "libc/sysv/consts/fileno.h"
+#include "libc/sysv/consts/o.h"
+#include "libc/sysv/consts/ok.h"
+#include "libc/time/time.h"
+#include "third_party/getopt/getopt.h"
+#include "third_party/musl/crypt.h"
+#include "third_party/musl/lockf.h" whenever target OS is not unix-like
+ * otherwise it will block preprocessing stage.
+ * The following list of build macros tries to "guess" if target OS is likely unix-like, and therefore can #include "libc/calls/calls.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/runtime/pathconf.h"
+#include "libc/runtime/runtime.h"
+#include "libc/runtime/sysconf.h"
+#include "libc/sysv/consts/f.h"
+#include "libc/sysv/consts/fileno.h"
+#include "libc/sysv/consts/o.h"
+#include "libc/sysv/consts/ok.h"
+#include "libc/time/time.h"
+#include "third_party/getopt/getopt.h"
+#include "third_party/musl/crypt.h"
+#include "third_party/musl/lockf.h"
+ */
+#  elif !defined(_WIN32) \
+     && ( defined(__unix__) || defined(__unix) \
+       || defined(__midipix__) || defined(__VMS) || defined(__HAIKU__) )
+
+#    if defined(__linux__) || defined(__linux) || defined(__CYGWIN__)
+#      ifndef _POSIX_C_SOURCE
+#        define _POSIX_C_SOURCE 200809L  /* feature test macro : https://www.gnu.org/software/libc/manual/html_node/Feature-Test-Macros.html */
+#      endif
+#    endif
+#include "libc/calls/calls.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/runtime/pathconf.h"
+#include "libc/runtime/runtime.h"
+#include "libc/runtime/sysconf.h"
+#include "libc/sysv/consts/f.h"
+#include "libc/sysv/consts/fileno.h"
+#include "libc/sysv/consts/o.h"
+#include "libc/sysv/consts/ok.h"
+#include "libc/time/time.h"
+#include "third_party/getopt/getopt.h"
+#include "third_party/musl/crypt.h"
+#include "third_party/musl/lockf.h"  /* declares _POSIX_VERSION */
+#    if defined(_POSIX_VERSION)  /* POSIX compliant */
+#      define PLATFORM_POSIX_VERSION _POSIX_VERSION
+#    else
+#      define PLATFORM_POSIX_VERSION 1
+#    endif
+
+#    ifdef __UCLIBC__
+#     ifndef __USE_MISC
+#      define __USE_MISC /* enable st_mtim on uclibc */
+#     endif
+#    endif
+
+#  else  /* non-unix target platform (like Windows) */
+#    define PLATFORM_POSIX_VERSION 0
+#  endif
+
+#endif   /* PLATFORM_POSIX_VERSION */
+
+
+#if PLATFORM_POSIX_VERSION > 1
+   /* glibc < 2.26 may not expose struct timespec def without this.
+    * See issue #1920. */
+#  ifndef _ATFILE_SOURCE
+#    define _ATFILE_SOURCE
+#  endif
+#endif
+
+
+/*-*********************************************
+*  Detect if isatty() and fileno() are available
+*
+*  Note: Use UTIL_isConsole() for the zstd CLI
+*  instead, as it allows faking is console for
+*  testing.
+************************************************/
+#if (defined(__linux__) && (PLATFORM_POSIX_VERSION > 1)) \
+ || (PLATFORM_POSIX_VERSION >= 200112L) \
+ || defined(__DJGPP__)
+#include "libc/calls/calls.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/runtime/pathconf.h"
+#include "libc/runtime/runtime.h"
+#include "libc/runtime/sysconf.h"
+#include "libc/sysv/consts/f.h"
+#include "libc/sysv/consts/fileno.h"
+#include "libc/sysv/consts/o.h"
+#include "libc/sysv/consts/ok.h"
+#include "libc/time/time.h"
+#include "third_party/getopt/getopt.h"
+#include "third_party/musl/crypt.h"
+#include "third_party/musl/lockf.h"   /* isatty */
+#include "libc/calls/calls.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/fmt/fmt.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/stdio.h"
+#include "libc/stdio/temp.h"
+#include "third_party/musl/tempnam.h"    /* fileno */
+#  define IS_CONSOLE(stdStream) isatty(fileno(stdStream))
+#elif defined(MSDOS) || defined(OS2)
+// MISSING #include <io.h>       /* _isatty */
+#  define IS_CONSOLE(stdStream) _isatty(_fileno(stdStream))
+#elif defined(WIN32) || defined(_WIN32)
+// MISSING #include <io.h>      /* _isatty */
+#include "libc/nt/accounting.h"
+#include "libc/nt/automation.h"
+#include "libc/nt/console.h"
+#include "libc/nt/debug.h"
+#include "libc/nt/dll.h"
+#include "libc/nt/enum/keyaccess.h"
+#include "libc/nt/enum/regtype.h"
+#include "libc/nt/errors.h"
+#include "libc/nt/events.h"
+#include "libc/nt/files.h"
+#include "libc/nt/ipc.h"
+#include "libc/nt/memory.h"
+#include "libc/nt/paint.h"
+#include "libc/nt/process.h"
+#include "libc/nt/registry.h"
+#include "libc/nt/synchronization.h"
+#include "libc/nt/thread.h"
+#include "libc/nt/windows.h"
+#include "libc/nt/winsock.h" /* DeviceIoControl, HANDLE, FSCTL_SET_SPARSE */
+#include "libc/calls/calls.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/fmt/fmt.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/stdio.h"
+#include "libc/stdio/temp.h"
+#include "third_party/musl/tempnam.h"   /* FILE */
+static __inline int IS_CONSOLE(FILE* stdStream) {
+    DWORD dummy;
+    return _isatty(_fileno(stdStream)) && GetConsoleMode((HANDLE)_get_osfhandle(_fileno(stdStream)), &dummy);
+}
+#else
+#  define IS_CONSOLE(stdStream) 0
+#endif
+
+
+/******************************
+*  OS-specific IO behaviors
+******************************/
+#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32)
+#include "libc/calls/calls.h"
+#include "libc/calls/struct/flock.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/sysv/consts/at.h"
+#include "libc/sysv/consts/f.h"
+#include "libc/sysv/consts/fd.h"
+#include "libc/sysv/consts/o.h"
+#include "libc/sysv/consts/posix.h"
+#include "libc/sysv/consts/s.h"
+#include "libc/sysv/consts/splice.h"   /* _O_BINARY */
+// MISSING #include <io.h>      /* _setmode, _fileno, _get_osfhandle */
+#  if !defined(__DJGPP__)
+#include "libc/nt/accounting.h"
+#include "libc/nt/automation.h"
+#include "libc/nt/console.h"
+#include "libc/nt/debug.h"
+#include "libc/nt/dll.h"
+#include "libc/nt/enum/keyaccess.h"
+#include "libc/nt/enum/regtype.h"
+#include "libc/nt/errors.h"
+#include "libc/nt/events.h"
+#include "libc/nt/files.h"
+#include "libc/nt/ipc.h"
+#include "libc/nt/memory.h"
+#include "libc/nt/paint.h"
+#include "libc/nt/process.h"
+#include "libc/nt/registry.h"
+#include "libc/nt/synchronization.h"
+#include "libc/nt/thread.h"
+#include "libc/nt/windows.h"
+#include "libc/nt/winsock.h" /* DeviceIoControl, HANDLE, FSCTL_SET_SPARSE */
+// MISSING #include <winioctl.h> /* FSCTL_SET_SPARSE */
+#    define SET_BINARY_MODE(file) { int const unused=_setmode(_fileno(file), _O_BINARY); (void)unused; }
+#    define SET_SPARSE_FILE_MODE(file) { DWORD dw; DeviceIoControl((HANDLE) _get_osfhandle(_fileno(file)), FSCTL_SET_SPARSE, 0, 0, 0, 0, &dw, 0); }
+#  else
+#    define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
+#    define SET_SPARSE_FILE_MODE(file)
+#  endif
+#else
+#  define SET_BINARY_MODE(file)
+#  define SET_SPARSE_FILE_MODE(file)
+#endif
+
+
+#ifndef ZSTD_SPARSE_DEFAULT
+#  if (defined(__APPLE__) && defined(__MACH__))
+#    define ZSTD_SPARSE_DEFAULT 0
+#  else
+#    define ZSTD_SPARSE_DEFAULT 1
+#  endif
+#endif
+
+
+#ifndef ZSTD_START_SYMBOLLIST_FRAME
+#  ifdef __linux__
+#    define ZSTD_START_SYMBOLLIST_FRAME 2
+#  elif defined __APPLE__
+#    define ZSTD_START_SYMBOLLIST_FRAME 4
+#  else
+#    define ZSTD_START_SYMBOLLIST_FRAME 0
+#  endif
+#endif
+
+
+#ifndef ZSTD_SETPRIORITY_SUPPORT
+   /* mandates presence of <sys/resource.h> and support for setpriority() : https://man7.org/linux/man-pages/man2/setpriority.2.html */
+#  define ZSTD_SETPRIORITY_SUPPORT (PLATFORM_POSIX_VERSION >= 200112L)
+#endif
+
+
+#ifndef ZSTD_NANOSLEEP_SUPPORT
+   /* mandates support of nanosleep() within <time.h> : https://man7.org/linux/man-pages/man2/nanosleep.2.html */
+#  if (defined(__linux__) && (PLATFORM_POSIX_VERSION >= 199309L)) \
+   || (PLATFORM_POSIX_VERSION >= 200112L)
+#     define ZSTD_NANOSLEEP_SUPPORT 1
+#  else
+#     define ZSTD_NANOSLEEP_SUPPORT 0
+#  endif
+#endif
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* PLATFORM_H_MODULE */
--- a/third_party/zstd/programs/timefn.c
+++ b/third_party/zstd/programs/timefn.c
@ -0,0 +1,258 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* ===  Dependencies  === */
+
+#include "third_party/zstd/programs/timefn.h"
+#include "third_party/zstd/programs/platform.h" /* set _POSIX_C_SOURCE */
+#include "libc/calls/calls.h"
+#include "libc/calls/struct/timespec.h"
+#include "libc/calls/struct/timeval.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/sysv/consts/clock.h"
+#include "libc/sysv/consts/sched.h"
+#include "libc/sysv/consts/timer.h"
+#include "libc/time/struct/tm.h"
+#include "libc/time/time.h"     /* CLOCK_MONOTONIC, TIME_UTC */
+
+/*-****************************************
+*  Time functions
+******************************************/
+
+#if defined(_WIN32)   /* Windows */
+
+#include "libc/nt/accounting.h"
+#include "libc/nt/automation.h"
+#include "libc/nt/console.h"
+#include "libc/nt/debug.h"
+#include "libc/nt/dll.h"
+#include "libc/nt/enum/keyaccess.h"
+#include "libc/nt/enum/regtype.h"
+#include "libc/nt/errors.h"
+#include "libc/nt/events.h"
+#include "libc/nt/files.h"
+#include "libc/nt/ipc.h"
+#include "libc/nt/memory.h"
+#include "libc/nt/paint.h"
+#include "libc/nt/process.h"
+#include "libc/nt/registry.h"
+#include "libc/nt/synchronization.h"
+#include "libc/nt/thread.h"
+#include "libc/nt/windows.h"
+#include "libc/nt/winsock.h"  /* LARGE_INTEGER */
+#include "libc/calls/calls.h"
+#include "libc/calls/termios.h"
+#include "libc/fmt/conv.h"
+#include "libc/limits.h"
+#include "libc/mem/alg.h"
+#include "libc/mem/alloca.h"
+#include "libc/mem/mem.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/rand.h"
+#include "libc/stdio/temp.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/exit.h"
+#include "third_party/getopt/getopt.h"
+#include "third_party/musl/crypt.h"
+#include "third_party/musl/rand48.h"   /* abort */
+#include "libc/calls/calls.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/fmt/fmt.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/stdio.h"
+#include "libc/stdio/temp.h"
+#include "third_party/musl/tempnam.h"    /* perror */
+
+UTIL_time_t UTIL_getTime(void)
+{
+    static LARGE_INTEGER ticksPerSecond;
+    static int init = 0;
+    if (!init) {
+        if (!QueryPerformanceFrequency(&ticksPerSecond)) {
+            perror("timefn::QueryPerformanceFrequency");
+            abort();
+        }
+        init = 1;
+    }
+    {   UTIL_time_t r;
+        LARGE_INTEGER x;
+        QueryPerformanceCounter(&x);
+        r.t = (PTime)(x.QuadPart * 1000000000ULL / ticksPerSecond.QuadPart);
+        return r;
+    }
+}
+
+
+#elif defined(__APPLE__) && defined(__MACH__)
+
+// MISSING #include <mach/mach_time.h> /* mach_timebase_info_data_t, mach_timebase_info, mach_absolute_time */
+
+UTIL_time_t UTIL_getTime(void)
+{
+    static mach_timebase_info_data_t rate;
+    static int init = 0;
+    if (!init) {
+        mach_timebase_info(&rate);
+        init = 1;
+    }
+    {   UTIL_time_t r;
+        r.t = mach_absolute_time() * (PTime)rate.numer / (PTime)rate.denom;
+        return r;
+    }
+}
+
+/* POSIX.1-2001 (optional) */
+#elif defined(CLOCK_MONOTONIC)
+
+#include "libc/calls/calls.h"
+#include "libc/calls/termios.h"
+#include "libc/fmt/conv.h"
+#include "libc/limits.h"
+#include "libc/mem/alg.h"
+#include "libc/mem/alloca.h"
+#include "libc/mem/mem.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/rand.h"
+#include "libc/stdio/temp.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/exit.h"
+#include "third_party/getopt/getopt.h"
+#include "third_party/musl/crypt.h"
+#include "third_party/musl/rand48.h"   /* abort */
+#include "libc/calls/calls.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/fmt/fmt.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/stdio.h"
+#include "libc/stdio/temp.h"
+#include "third_party/musl/tempnam.h"    /* perror */
+
+UTIL_time_t UTIL_getTime(void)
+{
+    /* time must be initialized, othersize it may fail msan test.
+     * No good reason, likely a limitation of timespec_get() for some target */
+    struct timespec time = { 0, 0 };
+    if (clock_gettime(CLOCK_MONOTONIC, &time) != 0) {
+        perror("timefn::clock_gettime(CLOCK_MONOTONIC)");
+        abort();
+    }
+    {   UTIL_time_t r;
+        r.t = (PTime)time.tv_sec * 1000000000ULL + (PTime)time.tv_nsec;
+        return r;
+    }
+}
+
+
+/* C11 requires support of timespec_get().
+ * However, FreeBSD 11 claims C11 compliance while lacking timespec_get().
+ * Double confirm timespec_get() support by checking the definition of TIME_UTC.
+ * However, some versions of Android manage to simultaneously define TIME_UTC
+ * and lack timespec_get() support... */
+#elif (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */) \
+    && defined(TIME_UTC) && !defined(__ANDROID__)
+
+#include "libc/calls/calls.h"
+#include "libc/calls/termios.h"
+#include "libc/fmt/conv.h"
+#include "libc/limits.h"
+#include "libc/mem/alg.h"
+#include "libc/mem/alloca.h"
+#include "libc/mem/mem.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/rand.h"
+#include "libc/stdio/temp.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/exit.h"
+#include "third_party/getopt/getopt.h"
+#include "third_party/musl/crypt.h"
+#include "third_party/musl/rand48.h"   /* abort */
+#include "libc/calls/calls.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/fmt/fmt.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/stdio.h"
+#include "libc/stdio/temp.h"
+#include "third_party/musl/tempnam.h"    /* perror */
+
+UTIL_time_t UTIL_getTime(void)
+{
+    /* time must be initialized, othersize it may fail msan test.
+     * No good reason, likely a limitation of timespec_get() for some target */
+    struct timespec time = { 0, 0 };
+    if (timespec_get(&time, TIME_UTC) != TIME_UTC) {
+        perror("timefn::timespec_get(TIME_UTC)");
+        abort();
+    }
+    {   UTIL_time_t r;
+        r.t = (PTime)time.tv_sec * 1000000000ULL + (PTime)time.tv_nsec;
+        return r;
+    }
+}
+
+
+#else   /* relies on standard C90 (note : clock_t produces wrong measurements for multi-threaded workloads) */
+
+UTIL_time_t UTIL_getTime(void)
+{
+    UTIL_time_t r;
+    r.t = (PTime)clock() * 1000000000ULL / CLOCKS_PER_SEC;
+    return r;
+}
+
+#define TIME_MT_MEASUREMENTS_NOT_SUPPORTED
+
+#endif
+
+/* ==== Common functions, valid for all time API ==== */
+
+PTime UTIL_getSpanTimeNano(UTIL_time_t clockStart, UTIL_time_t clockEnd)
+{
+    return clockEnd.t - clockStart.t;
+}
+
+PTime UTIL_getSpanTimeMicro(UTIL_time_t begin, UTIL_time_t end)
+{
+    return UTIL_getSpanTimeNano(begin, end) / 1000ULL;
+}
+
+PTime UTIL_clockSpanMicro(UTIL_time_t clockStart )
+{
+    UTIL_time_t const clockEnd = UTIL_getTime();
+    return UTIL_getSpanTimeMicro(clockStart, clockEnd);
+}
+
+PTime UTIL_clockSpanNano(UTIL_time_t clockStart )
+{
+    UTIL_time_t const clockEnd = UTIL_getTime();
+    return UTIL_getSpanTimeNano(clockStart, clockEnd);
+}
+
+void UTIL_waitForNextTick(void)
+{
+    UTIL_time_t const clockStart = UTIL_getTime();
+    UTIL_time_t clockEnd;
+    do {
+        clockEnd = UTIL_getTime();
+    } while (UTIL_getSpanTimeNano(clockStart, clockEnd) == 0);
+}
+
+int UTIL_support_MT_measurements(void)
+{
+# if defined(TIME_MT_MEASUREMENTS_NOT_SUPPORTED)
+    return 0;
+# else
+    return 1;
+# endif
+}
--- a/third_party/zstd/programs/timefn.h
+++ b/third_party/zstd/programs/timefn.h
@ -0,0 +1,76 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef TIME_FN_H_MODULE_287987
+#define TIME_FN_H_MODULE_287987
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+
+/*-****************************************
+*  Types
+******************************************/
+
+#if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# if defined(_AIX)
+#include "libc/fmt/conv.h"
+#include "libc/inttypes.h"
+#include "libc/limits.h"
+#include "libc/literal.h"
+# else
+#include "libc/inttypes.h"
+#include "libc/limits.h"
+#include "libc/literal.h" /* uint64_t */
+# endif
+  typedef uint64_t           PTime;  /* Precise Time */
+#else
+  typedef unsigned long long PTime;  /* does not support compilers without long long support */
+#endif
+
+/* UTIL_time_t contains a nanosecond time counter.
+ * The absolute value is not meaningful.
+ * It's only valid to compute the difference between 2 measurements. */
+typedef struct { PTime t; } UTIL_time_t;
+#define UTIL_TIME_INITIALIZER { 0 }
+
+
+/*-****************************************
+*  Time functions
+******************************************/
+
+UTIL_time_t UTIL_getTime(void);
+
+/* Timer resolution can be low on some platforms.
+ * To improve accuracy, it's recommended to wait for a new tick
+ * before starting benchmark measurements */
+void UTIL_waitForNextTick(void);
+/* tells if timefn will return correct time measurements
+ * in presence of multi-threaded workload.
+ * note : this is not the case if only C90 clock_t measurements are available */
+int UTIL_support_MT_measurements(void);
+
+PTime UTIL_getSpanTimeNano(UTIL_time_t clockStart, UTIL_time_t clockEnd);
+PTime UTIL_clockSpanNano(UTIL_time_t clockStart);
+
+PTime UTIL_getSpanTimeMicro(UTIL_time_t clockStart, UTIL_time_t clockEnd);
+PTime UTIL_clockSpanMicro(UTIL_time_t clockStart);
+
+#define SEC_TO_MICRO ((PTime)1000000)  /* nb of microseconds in a second */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* TIME_FN_H_MODULE_287987 */
--- a/third_party/zstd/programs/util.c
+++ b/third_party/zstd/programs/util.c
--- a/third_party/zstd/programs/util.h
+++ b/third_party/zstd/programs/util.h
@ -0,0 +1,411 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef UTIL_H_MODULE
+#define UTIL_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*-****************************************
+*  Dependencies
+******************************************/
+#include "third_party/zstd/programs/platform.h"     /* PLATFORM_POSIX_VERSION, ZSTD_NANOSLEEP_SUPPORT, ZSTD_SETPRIORITY_SUPPORT */
+       /* size_t, ptrdiff_t */
+#include "libc/calls/makedev.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/thread/thread.h"
+#include "libc/calls/typedef/u.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/intrin/newbie.h"
+#include "libc/sock/select.h"
+#include "libc/sysv/consts/endian.h"    /* stat, utime */
+#include "libc/calls/calls.h"
+#include "libc/calls/struct/stat.h"
+#include "libc/calls/struct/stat.macros.h"
+#include "libc/calls/struct/timespec.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/sysv/consts/s.h"
+#include "libc/sysv/consts/utime.h"
+#include "libc/time/time.h"     /* stat, chmod */
+#include "third_party/zstd/lib/common/mem.h"          /* U64 */
+
+
+/*-************************************************************
+* Avoid fseek()'s 2GiB barrier with MSVC, macOS, *BSD, MinGW
+***************************************************************/
+#if defined(_MSC_VER) && (_MSC_VER >= 1400)
+#  define UTIL_fseek _fseeki64
+#elif !defined(__64BIT__) && (PLATFORM_POSIX_VERSION >= 200112L) /* No point defining Large file for 64 bit */
+#  define UTIL_fseek fseeko
+#elif defined(__MINGW32__) && defined(__MSVCRT__) && !defined(__STRICT_ANSI__) && !defined(__NO_MINGW_LFS)
+#  define UTIL_fseek fseeko64
+#else
+#  define UTIL_fseek fseek
+#endif
+
+
+/*-*************************************************
+*  Sleep & priority functions: Windows - Posix - others
+***************************************************/
+#if defined(_WIN32)
+#include "libc/nt/accounting.h"
+#include "libc/nt/automation.h"
+#include "libc/nt/console.h"
+#include "libc/nt/debug.h"
+#include "libc/nt/dll.h"
+#include "libc/nt/enum/keyaccess.h"
+#include "libc/nt/enum/regtype.h"
+#include "libc/nt/errors.h"
+#include "libc/nt/events.h"
+#include "libc/nt/files.h"
+#include "libc/nt/ipc.h"
+#include "libc/nt/memory.h"
+#include "libc/nt/paint.h"
+#include "libc/nt/process.h"
+#include "libc/nt/registry.h"
+#include "libc/nt/synchronization.h"
+#include "libc/nt/thread.h"
+#include "libc/nt/windows.h"
+#include "libc/nt/winsock.h"
+#  define SET_REALTIME_PRIORITY SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS)
+#  define UTIL_sleep(s) Sleep(1000*s)
+#  define UTIL_sleepMilli(milli) Sleep(milli)
+
+#elif PLATFORM_POSIX_VERSION > 0 /* Unix-like operating system */
+#include "libc/calls/calls.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/runtime/pathconf.h"
+#include "libc/runtime/runtime.h"
+#include "libc/runtime/sysconf.h"
+#include "libc/sysv/consts/f.h"
+#include "libc/sysv/consts/fileno.h"
+#include "libc/sysv/consts/o.h"
+#include "libc/sysv/consts/ok.h"
+#include "libc/time/time.h"
+#include "third_party/getopt/getopt.h"
+#include "third_party/musl/crypt.h"
+#include "third_party/musl/lockf.h"   /* sleep */
+#  define UTIL_sleep(s) sleep(s)
+#  if ZSTD_NANOSLEEP_SUPPORT   /* necessarily defined in platform.h */
+#      define UTIL_sleepMilli(milli) { struct timespec t; t.tv_sec=0; t.tv_nsec=milli*1000000ULL; nanosleep(&t, NULL); }
+#  else
+#      define UTIL_sleepMilli(milli) /* disabled */
+#  endif
+#  if ZSTD_SETPRIORITY_SUPPORT
+#include "libc/calls/calls.h"
+#include "libc/calls/struct/rlimit.h"
+#include "libc/calls/struct/rusage.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/sysv/consts/prio.h"
+#include "libc/sysv/consts/rlim.h"
+#include "libc/sysv/consts/rlimit.h"
+#include "libc/sysv/consts/rusage.h" /* setpriority */
+#    define SET_REALTIME_PRIORITY setpriority(PRIO_PROCESS, 0, -20)
+#  else
+#    define SET_REALTIME_PRIORITY /* disabled */
+#  endif
+
+#else  /* unknown non-unix operating system */
+#  define UTIL_sleep(s)          /* disabled */
+#  define UTIL_sleepMilli(milli) /* disabled */
+#  define SET_REALTIME_PRIORITY  /* disabled */
+#endif
+
+
+/*-****************************************
+*  Compiler specifics
+******************************************/
+#if defined(__INTEL_COMPILER)
+#  pragma warning(disable : 177)    /* disable: message #177: function was declared but never referenced, useful with UTIL_STATIC */
+#endif
+#if defined(__GNUC__)
+#  define UTIL_STATIC static __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define UTIL_STATIC static inline
+#elif defined(_MSC_VER)
+#  define UTIL_STATIC static __inline
+#else
+#  define UTIL_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/*-****************************************
+*  Console log
+******************************************/
+extern int g_utilDisplayLevel;
+
+/**
+ * Displays a message prompt and returns success (0) if first character from stdin
+ * matches any from acceptableLetters. Otherwise, returns failure (1) and displays abortMsg.
+ * If any of the inputs are stdin itself, then automatically return failure (1).
+ */
+int UTIL_requireUserConfirmation(const char* prompt, const char* abortMsg, const char* acceptableLetters, int hasStdinInput);
+
+
+/*-****************************************
+*  File functions
+******************************************/
+#if defined(_MSC_VER)
+    typedef struct __stat64 stat_t;
+    typedef int mode_t;
+#elif defined(__MINGW32__) && defined (__MSVCRT__)
+    typedef struct _stati64 stat_t;
+#else
+    typedef struct stat stat_t;
+#endif
+
+#if defined(_MSC_VER) || defined(__MINGW32__) || defined (__MSVCRT__) /* windows support */
+#define PATH_SEP '\\'
+#define STRDUP(s) _strdup(s)
+#else
+#define PATH_SEP '/'
+#include "libc/fmt/libgen.h"
+#define STRDUP(s) strdup(s)
+#endif
+
+
+/**
+ * Calls platform's equivalent of stat() on filename and writes info to statbuf.
+ * Returns success (1) or failure (0).
+ *
+ * UTIL_fstat() is like UTIL_stat() but takes an optional fd that refers to the
+ * file in question. It turns out that this can be meaningfully faster. If fd is
+ * -1, behaves just like UTIL_stat() (i.e., falls back to using the filename).
+ */
+int UTIL_stat(const char* filename, stat_t* statbuf);
+int UTIL_fstat(const int fd, const char* filename, stat_t* statbuf);
+
+/**
+ * Instead of getting a file's stats, this updates them with the info in the
+ * provided stat_t. Currently sets owner, group, atime, and mtime. Will only
+ * update this info for regular files.
+ *
+ * UTIL_setFDStat() also takes an fd, and will preferentially use that to
+ * indicate which file to modify, If fd is -1, it will fall back to using the
+ * filename.
+ */
+int UTIL_setFileStat(const char* filename, const stat_t* statbuf);
+int UTIL_setFDStat(const int fd, const char* filename, const stat_t* statbuf);
+
+/**
+ * Set atime to now and mtime to the st_mtim in statbuf.
+ *
+ * Directly wraps utime() or utimensat(). Returns -1 on error.
+ * Does not validate filename is valid.
+ */
+int UTIL_utime(const char* filename, const stat_t *statbuf);
+
+/*
+ * These helpers operate on a pre-populated stat_t, i.e., the result of
+ * calling one of the above functions.
+ */
+
+int UTIL_isRegularFileStat(const stat_t* statbuf);
+int UTIL_isDirectoryStat(const stat_t* statbuf);
+int UTIL_isFIFOStat(const stat_t* statbuf);
+int UTIL_isBlockDevStat(const stat_t* statbuf);
+U64 UTIL_getFileSizeStat(const stat_t* statbuf);
+
+/**
+ * Like chmod(), but only modifies regular files. Provided statbuf may be NULL,
+ * in which case this function will stat() the file internally, in order to
+ * check whether it should be modified.
+ *
+ * If fd is -1, fd is ignored and the filename is used.
+ */
+int UTIL_chmod(char const* filename, const stat_t* statbuf, mode_t permissions);
+int UTIL_fchmod(const int fd, char const* filename, const stat_t* statbuf, mode_t permissions);
+
+/*
+ * In the absence of a pre-existing stat result on the file in question, these
+ * functions will do a stat() call internally and then use that result to
+ * compute the needed information.
+ */
+
+int UTIL_isRegularFile(const char* infilename);
+int UTIL_isDirectory(const char* infilename);
+int UTIL_isSameFile(const char* file1, const char* file2);
+int UTIL_isSameFileStat(const char* file1, const char* file2, const stat_t* file1Stat, const stat_t* file2Stat);
+int UTIL_isCompressedFile(const char* infilename, const char *extensionList[]);
+int UTIL_isLink(const char* infilename);
+int UTIL_isFIFO(const char* infilename);
+
+/**
+ * Returns with the given file descriptor is a console.
+ * Allows faking whether stdin/stdout/stderr is a console
+ * using UTIL_fake*IsConsole().
+ */
+int UTIL_isConsole(FILE* file);
+
+/**
+ * Pretends that stdin/stdout/stderr is a console for testing.
+ */
+void UTIL_fakeStdinIsConsole(void);
+void UTIL_fakeStdoutIsConsole(void);
+void UTIL_fakeStderrIsConsole(void);
+
+/**
+ * Emit traces for functions that read, or modify file metadata.
+ */
+void UTIL_traceFileStat(void);
+
+#define UTIL_FILESIZE_UNKNOWN  ((U64)(-1))
+U64 UTIL_getFileSize(const char* infilename);
+U64 UTIL_getTotalFileSize(const char* const * fileNamesTable, unsigned nbFiles);
+
+/**
+ * Take @size in bytes,
+ * prepare the components to pretty-print it in a scaled way.
+ * The components in the returned struct should be passed in
+ * precision, value, suffix order to a "%.*f%s" format string.
+ * Output policy is sensible to @g_utilDisplayLevel,
+ * for verbose mode (@g_utilDisplayLevel >= 4),
+ * does not scale down.
+ */
+typedef struct {
+  double value;
+  int precision;
+  const char* suffix;
+} UTIL_HumanReadableSize_t;
+
+UTIL_HumanReadableSize_t UTIL_makeHumanReadableSize(U64 size);
+
+int UTIL_compareStr(const void *p1, const void *p2);
+const char* UTIL_getFileExtension(const char* infilename);
+void  UTIL_mirrorSourceFilesDirectories(const char** fileNamesTable, unsigned int nbFiles, const char *outDirName);
+char* UTIL_createMirroredDestDirName(const char* srcFileName, const char* outDirRootName);
+
+
+
+/*-****************************************
+ *  Lists of Filenames
+ ******************************************/
+
+typedef struct
+{   const char** fileNames;
+    char* buf;            /* fileNames are stored in this buffer (or are read-only) */
+    size_t tableSize;     /* nb of fileNames */
+    size_t tableCapacity;
+} FileNamesTable;
+
+/*! UTIL_createFileNamesTable_fromFileName() :
+ *  read filenames from @inputFileName, and store them into returned object.
+ * @return : a FileNamesTable*, or NULL in case of error (ex: @inputFileName doesn't exist).
+ *  Note: inputFileSize must be less than 50MB
+ */
+FileNamesTable*
+UTIL_createFileNamesTable_fromFileName(const char* inputFileName);
+
+/*! UTIL_assembleFileNamesTable() :
+ *  This function takes ownership of its arguments, @filenames and @buf,
+ *  and store them inside the created object.
+ *  note : this function never fails,
+ *         it will rather exit() the program if internal allocation fails.
+ * @return : resulting FileNamesTable* object.
+ */
+FileNamesTable*
+UTIL_assembleFileNamesTable(const char** filenames, size_t tableSize, char* buf);
+
+/*! UTIL_freeFileNamesTable() :
+ *  This function is compatible with NULL argument and never fails.
+ */
+void UTIL_freeFileNamesTable(FileNamesTable* table);
+
+/*! UTIL_mergeFileNamesTable():
+ * @return : FileNamesTable*, concatenation of @table1 and @table2
+ *  note: @table1 and @table2 are consumed (freed) by this operation
+ */
+FileNamesTable*
+UTIL_mergeFileNamesTable(FileNamesTable* table1, FileNamesTable* table2);
+
+
+/*! UTIL_expandFNT() :
+ *  read names from @fnt, and expand those corresponding to directories
+ *  update @fnt, now containing only file names,
+ *  note : in case of error, @fnt[0] is NULL
+ */
+void UTIL_expandFNT(FileNamesTable** fnt, int followLinks);
+
+/*! UTIL_createFNT_fromROTable() :
+ *  copy the @filenames pointer table inside the returned object.
+ *  The names themselves are still stored in their original buffer, which must outlive the object.
+ * @return : a FileNamesTable* object,
+ *        or NULL in case of error
+ */
+FileNamesTable*
+UTIL_createFNT_fromROTable(const char** filenames, size_t nbFilenames);
+
+/*! UTIL_allocateFileNamesTable() :
+ *  Allocates a table of const char*, to insert read-only names later on.
+ *  The created FileNamesTable* doesn't hold a buffer.
+ * @return : FileNamesTable*, or NULL, if allocation fails.
+ */
+FileNamesTable* UTIL_allocateFileNamesTable(size_t tableSize);
+
+/*! UTIL_searchFileNamesTable() :
+ *  Searched through entries in FileNamesTable for a specific name.
+ * @return : index of entry if found or -1 if not found
+ */
+int UTIL_searchFileNamesTable(FileNamesTable* table, char const* name);
+
+/*! UTIL_refFilename() :
+ *  Add a reference to read-only name into @fnt table.
+ *  As @filename is only referenced, its lifetime must outlive @fnt.
+ *  Internal table must be large enough to reference a new member,
+ *  otherwise its UB (protected by an `assert()`).
+ */
+void UTIL_refFilename(FileNamesTable* fnt, const char* filename);
+
+
+/* UTIL_createExpandedFNT() is only active if UTIL_HAS_CREATEFILELIST is defined.
+ * Otherwise, UTIL_createExpandedFNT() is a shell function which does nothing
+ * apart from displaying a warning message.
+ */
+#ifdef _WIN32
+#  define UTIL_HAS_CREATEFILELIST
+#elif defined(__linux__) || (PLATFORM_POSIX_VERSION >= 200112L)  /* opendir, readdir require POSIX.1-2001 */
+#  define UTIL_HAS_CREATEFILELIST
+#  define UTIL_HAS_MIRRORFILELIST
+#else
+   /* do not define UTIL_HAS_CREATEFILELIST */
+#endif
+
+/*! UTIL_createExpandedFNT() :
+ *  read names from @filenames, and expand those corresponding to directories.
+ *  links are followed or not depending on @followLinks directive.
+ * @return : an expanded FileNamesTable*, where each name is a file
+ *        or NULL in case of error
+ */
+FileNamesTable*
+UTIL_createExpandedFNT(const char* const* filenames, size_t nbFilenames, int followLinks);
+
+#if defined(_WIN32) || defined(WIN32)
+DWORD CountSetBits(ULONG_PTR bitMask);
+#endif
+
+/*-****************************************
+ *  System
+ ******************************************/
+
+int UTIL_countCores(int logical);
+
+int UTIL_countPhysicalCores(void);
+
+int UTIL_countLogicalCores(void);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* UTIL_H_MODULE */
--- a/third_party/zstd/programs/zstdcli.c
+++ b/third_party/zstd/programs/zstdcli.c
--- a/third_party/zstd/programs/zstdcli_trace.c
+++ b/third_party/zstd/programs/zstdcli_trace.c
@ -0,0 +1,195 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include "third_party/zstd/programs/zstdcli_trace.h"
+
+#include "libc/calls/calls.h"
+#include "libc/assert.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/fmt/fmt.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/stdio.h"
+#include "libc/stdio/temp.h"
+#include "third_party/musl/tempnam.h"
+#include "libc/calls/calls.h"
+#include "libc/calls/termios.h"
+#include "libc/fmt/conv.h"
+#include "libc/limits.h"
+#include "libc/mem/alg.h"
+#include "libc/mem/alloca.h"
+#include "libc/mem/mem.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/dprintf.h"
+#include "libc/stdio/rand.h"
+#include "libc/stdio/temp.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/exit.h"
+#include "third_party/getopt/getopt.h"
+#include "third_party/musl/crypt.h"
+#include "third_party/musl/rand48.h"
+
+#include "third_party/zstd/programs/timefn.h"
+#include "third_party/zstd/programs/util.h"
+
+#define ZSTD_STATIC_LINKING_ONLY
+#include "third_party/zstd/zstd.h"
+/* We depend on the trace header to avoid duplicating the ZSTD_trace struct.
+ * But, we check the version so it is compatible with dynamic linking.
+ */
+#include "third_party/zstd/lib/common/zstd_trace.h"
+/* We only use macros from threading.h so it is compatible with dynamic linking */
+#include "third_party/zstd/lib/common/threading.h"
+
+#if ZSTD_TRACE
+
+static FILE* g_traceFile = NULL;
+static int g_mutexInit = 0;
+static ZSTD_pthread_mutex_t g_mutex;
+static UTIL_time_t g_enableTime = UTIL_TIME_INITIALIZER;
+
+void TRACE_enable(char const* filename)
+{
+    int const writeHeader = !UTIL_isRegularFile(filename);
+    if (g_traceFile)
+        fclose(g_traceFile);
+    g_traceFile = fopen(filename, "a");
+    if (g_traceFile && writeHeader) {
+        /* Fields:
+        * algorithm
+        * version
+        * method
+        * streaming
+        * level
+        * workers
+        * dictionary size
+        * uncompressed size
+        * compressed size
+        * duration nanos
+        * compression ratio
+        * speed MB/s
+        */
+        fprintf(g_traceFile, "Algorithm, Version, Method, Mode, Level, Workers, Dictionary Size, Uncompressed Size, Compressed Size, Duration Nanos, Compression Ratio, Speed MB/s\n");
+    }
+    g_enableTime = UTIL_getTime();
+    if (!g_mutexInit) {
+        if (!ZSTD_pthread_mutex_init(&g_mutex, NULL)) {
+            g_mutexInit = 1;
+        } else {
+            TRACE_finish();
+        }
+    }
+}
+
+void TRACE_finish(void)
+{
+    if (g_traceFile) {
+        fclose(g_traceFile);
+    }
+    g_traceFile = NULL;
+    if (g_mutexInit) {
+        ZSTD_pthread_mutex_destroy(&g_mutex);
+        g_mutexInit = 0;
+    }
+}
+
+static void TRACE_log(char const* method, PTime duration, ZSTD_Trace const* trace)
+{
+    int level = 0;
+    int workers = 0;
+    double const ratio = (double)trace->uncompressedSize / (double)trace->compressedSize;
+    double const speed = ((double)trace->uncompressedSize * 1000) / (double)duration;
+    if (trace->params) {
+        ZSTD_CCtxParams_getParameter(trace->params, ZSTD_c_compressionLevel, &level);
+        ZSTD_CCtxParams_getParameter(trace->params, ZSTD_c_nbWorkers, &workers);
+    }
+    assert(g_traceFile != NULL);
+
+    ZSTD_pthread_mutex_lock(&g_mutex);
+    /* Fields:
+     * algorithm
+     * version
+     * method
+     * streaming
+     * level
+     * workers
+     * dictionary size
+     * uncompressed size
+     * compressed size
+     * duration nanos
+     * compression ratio
+     * speed MB/s
+     */
+    fprintf(g_traceFile,
+        "zstd, %u, %s, %s, %d, %d, %llu, %llu, %llu, %llu, %.2f, %.2f\n",
+        trace->version,
+        method,
+        trace->streaming ? "streaming" : "single-pass",
+        level,
+        workers,
+        (unsigned long long)trace->dictionarySize,
+        (unsigned long long)trace->uncompressedSize,
+        (unsigned long long)trace->compressedSize,
+        (unsigned long long)duration,
+        ratio,
+        speed);
+    ZSTD_pthread_mutex_unlock(&g_mutex);
+}
+
+/**
+ * These symbols override the weak symbols provided by the library.
+ */
+
+ZSTD_TraceCtx ZSTD_trace_compress_begin(ZSTD_CCtx const* cctx)
+{
+    (void)cctx;
+    if (g_traceFile == NULL)
+        return 0;
+    return (ZSTD_TraceCtx)UTIL_clockSpanNano(g_enableTime);
+}
+
+void ZSTD_trace_compress_end(ZSTD_TraceCtx ctx, ZSTD_Trace const* trace)
+{
+    PTime const beginNanos = (PTime)ctx;
+    PTime const endNanos = UTIL_clockSpanNano(g_enableTime);
+    PTime const durationNanos = endNanos > beginNanos ? endNanos - beginNanos : 0;
+    assert(g_traceFile != NULL);
+    assert(trace->version == ZSTD_VERSION_NUMBER); /* CLI version must match. */
+    TRACE_log("compress", durationNanos, trace);
+}
+
+ZSTD_TraceCtx ZSTD_trace_decompress_begin(ZSTD_DCtx const* dctx)
+{
+    (void)dctx;
+    if (g_traceFile == NULL)
+        return 0;
+    return (ZSTD_TraceCtx)UTIL_clockSpanNano(g_enableTime);
+}
+
+void ZSTD_trace_decompress_end(ZSTD_TraceCtx ctx, ZSTD_Trace const* trace)
+{
+    PTime const beginNanos = (PTime)ctx;
+    PTime const endNanos = UTIL_clockSpanNano(g_enableTime);
+    PTime const durationNanos = endNanos > beginNanos ? endNanos - beginNanos : 0;
+    assert(g_traceFile != NULL);
+    assert(trace->version == ZSTD_VERSION_NUMBER); /* CLI version must match. */
+    TRACE_log("decompress", durationNanos, trace);
+}
+
+#else /* ZSTD_TRACE */
+
+void TRACE_enable(char const* filename)
+{
+    (void)filename;
+}
+
+void TRACE_finish(void) {}
+
+#endif /* ZSTD_TRACE */
--- a/third_party/zstd/programs/zstdcli_trace.h
+++ b/third_party/zstd/programs/zstdcli_trace.h
@ -0,0 +1,25 @@
+// clang-format off
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTDCLI_TRACE_H
+#define ZSTDCLI_TRACE_H
+
+/**
+ * Enable tracing - log to filename.
+ */
+void TRACE_enable(char const* filename);
+
+/**
+ * Shut down the tracing library.
+ */
+void TRACE_finish(void);
+
+#endif /* ZSTDCLI_TRACE_H */
--- a/Show more
+++ b/Show more