Make AARCH64 harder, better, faster, stronger

- Perform some housekeeping on scalar math function code
- Import ARM's Optimized Routines for SIMD string processing
- Upgrade to latest Chromium zlib and enable more SIMD optimizations
This commit is contained in:
Justine Tunney 2023-05-15 01:51:29 -07:00
parent 550b52abf6
commit cc1732bc42
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
143 changed files with 15661 additions and 1329 deletions

View file

@ -14,11 +14,6 @@
#include "libc/stdio/temp.h"
#include "libc/str/str.h"
#include "third_party/zlib/deflate.internal.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/*
@ -174,7 +169,7 @@ local void gen_trees_header OF((void));
#else /* !ZLIB_DEBUG */
# define send_code(s, c, tree) \
{ if (z_verbose>2) kprintf("\ncd %3d ",(c)); \
{ if (z_verbose>2) kprintf("\ncd %3d ",(c)); \
send_bits(s, tree[c].Code, tree[c].Len); }
#endif
@ -204,7 +199,7 @@ local void send_bits(s, value, length)
s->bits_sent += (ulg)length;
/* If not enough room in bi_buf, use (valid) bits from bi_buf and
* (16 - bi_valid) bits from value, leaving (width - (16-bi_valid))
* (16 - bi_valid) bits from value, leaving (width - (16 - bi_valid))
* unused bits in value.
*/
if (s->bi_valid > (int)Buf_size - length) {
@ -267,7 +262,7 @@ local void tr_static_init()
length = 0;
for (code = 0; code < LENGTH_CODES-1; code++) {
base_length[code] = length;
for (n = 0; n < (1<<extra_lbits[code]); n++) {
for (n = 0; n < (1 << extra_lbits[code]); n++) {
_length_code[length++] = (uch)code;
}
}
@ -276,13 +271,13 @@ local void tr_static_init()
* in two different ways: code 284 + 5 bits or code 285, so we
* overwrite length_code[255] to use the best encoding:
*/
_length_code[length-1] = (uch)code;
_length_code[length - 1] = (uch)code;
/* Initialize the mapping dist (0..32K) -> dist code (0..29) */
dist = 0;
for (code = 0 ; code < 16; code++) {
base_dist[code] = dist;
for (n = 0; n < (1<<extra_dbits[code]); n++) {
for (n = 0; n < (1 << extra_dbits[code]); n++) {
_dist_code[dist++] = (uch)code;
}
}
@ -290,11 +285,11 @@ local void tr_static_init()
dist >>= 7; /* from now on, all distances are divided by 128 */
for ( ; code < D_CODES; code++) {
base_dist[code] = dist << 7;
for (n = 0; n < (1<<(extra_dbits[code]-7)); n++) {
for (n = 0; n < (1 << (extra_dbits[code] - 7)); n++) {
_dist_code[256 + dist++] = (uch)code;
}
}
Assert (dist == 256, "tr_static_init: 256+dist != 512");
Assert (dist == 256, "tr_static_init: 256 + dist != 512");
/* Construct the codes of the static literal tree */
for (bits = 0; bits <= MAX_BITS; bits++) bl_count[bits] = 0;
@ -327,11 +322,12 @@ local void tr_static_init()
*/
#ifdef GEN_TREES_H
# ifndef ZLIB_DEBUG
//# include <stdio.h>
# endif
# define SEPARATOR(i, last, width) \
((i) == (last)? "\n};\n\n" : \
((i) % (width) == (width)-1 ? ",\n" : ", "))
((i) % (width) == (width) - 1 ? ",\n" : ", "))
void gen_trees_header()
{
@ -468,7 +464,7 @@ local void pqdownheap(s, tree, k)
while (j <= s->heap_len) {
/* Set j to the smallest of the two sons: */
if (j < s->heap_len &&
smaller(tree, s->heap[j+1], s->heap[j], s->depth)) {
smaller(tree, s->heap[j + 1], s->heap[j], s->depth)) {
j++;
}
/* Exit if v is smaller than both sons */
@ -517,7 +513,7 @@ local void gen_bitlen(s, desc)
*/
tree[s->heap[s->heap_max]].Len = 0; /* root of the heap */
for (h = s->heap_max+1; h < HEAP_SIZE; h++) {
for (h = s->heap_max + 1; h < HEAP_SIZE; h++) {
n = s->heap[h];
bits = tree[tree[n].Dad].Len + 1;
if (bits > max_length) bits = max_length, overflow++;
@ -528,7 +524,7 @@ local void gen_bitlen(s, desc)
s->bl_count[bits]++;
xbits = 0;
if (n >= base) xbits = extra[n-base];
if (n >= base) xbits = extra[n - base];
f = tree[n].Freq;
s->opt_len += (ulg)f * (unsigned)(bits + xbits);
if (stree) s->static_len += (ulg)f * (unsigned)(stree[n].Len + xbits);
@ -540,10 +536,10 @@ local void gen_bitlen(s, desc)
/* Find the first bit length which could increase: */
do {
bits = max_length-1;
bits = max_length - 1;
while (s->bl_count[bits] == 0) bits--;
s->bl_count[bits]--; /* move one leaf down the tree */
s->bl_count[bits+1] += 2; /* move one overflow item as its brother */
s->bl_count[bits]--; /* move one leaf down the tree */
s->bl_count[bits + 1] += 2; /* move one overflow item as its brother */
s->bl_count[max_length]--;
/* The brother of the overflow item also moves one step up,
* but this does not affect bl_count[max_length]
@ -579,7 +575,7 @@ local void gen_bitlen(s, desc)
* OUT assertion: the field code is set for all tree elements of non
* zero code length.
*/
local void gen_codes (tree, max_code, bl_count)
local void gen_codes(tree, max_code, bl_count)
ct_data *tree; /* the tree to decorate */
int max_code; /* largest code with non zero frequency */
ushf *bl_count; /* number of codes at each bit length */
@ -593,13 +589,13 @@ local void gen_codes (tree, max_code, bl_count)
* without bit reversal.
*/
for (bits = 1; bits <= MAX_BITS; bits++) {
code = (code + bl_count[bits-1]) << 1;
code = (code + bl_count[bits - 1]) << 1;
next_code[bits] = (ush)code;
}
/* Check that the bit counts in bl_count are consistent. The last code
* must be all ones.
*/
Assert (code + bl_count[MAX_BITS]-1 == (1<<MAX_BITS)-1,
Assert (code + bl_count[MAX_BITS] - 1 == (1 << MAX_BITS) - 1,
"inconsistent bit counts");
Tracev(("\ngen_codes: max_code %d ", max_code));
@ -610,7 +606,7 @@ local void gen_codes (tree, max_code, bl_count)
tree[n].Code = (ush)bi_reverse(next_code[len]++, len);
Tracecv(tree != static_ltree, ("\nn %3d %c l %2d c %4x (%x) ",
n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len]-1));
n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len] - 1));
}
}
@ -634,7 +630,7 @@ local void build_tree(s, desc)
int node; /* new node being created */
/* Construct the initial heap, with least frequent element in
* heap[SMALLEST]. The sons of heap[n] are heap[2*n] and heap[2*n+1].
* heap[SMALLEST]. The sons of heap[n] are heap[2*n] and heap[2*n + 1].
* heap[0] is not used.
*/
s->heap_len = 0, s->heap_max = HEAP_SIZE;
@ -662,7 +658,7 @@ local void build_tree(s, desc)
}
desc->max_code = max_code;
/* The elements heap[heap_len/2+1 .. heap_len] are leaves of the tree,
/* The elements heap[heap_len/2 + 1 .. heap_len] are leaves of the tree,
* establish sub-heaps of increasing lengths:
*/
for (n = s->heap_len/2; n >= 1; n--) pqdownheap(s, tree, n);
@ -710,7 +706,7 @@ local void build_tree(s, desc)
* Scan a literal or distance tree to determine the frequencies of the codes
* in the bit length tree.
*/
local void scan_tree (s, tree, max_code)
local void scan_tree(s, tree, max_code)
deflate_state *s;
ct_data *tree; /* the tree to be scanned */
int max_code; /* and its largest code of non zero frequency */
@ -724,10 +720,10 @@ local void scan_tree (s, tree, max_code)
int min_count = 4; /* min repeat count */
if (nextlen == 0) max_count = 138, min_count = 3;
tree[max_code+1].Len = (ush)0xffff; /* guard */
tree[max_code + 1].Len = (ush)0xffff; /* guard */
for (n = 0; n <= max_code; n++) {
curlen = nextlen; nextlen = tree[n+1].Len;
curlen = nextlen; nextlen = tree[n + 1].Len;
if (++count < max_count && curlen == nextlen) {
continue;
} else if (count < min_count) {
@ -755,7 +751,7 @@ local void scan_tree (s, tree, max_code)
* Send a literal or distance tree in compressed form, using the codes in
* bl_tree.
*/
local void send_tree (s, tree, max_code)
local void send_tree(s, tree, max_code)
deflate_state *s;
ct_data *tree; /* the tree to be scanned */
int max_code; /* and its largest code of non zero frequency */
@ -768,11 +764,11 @@ local void send_tree (s, tree, max_code)
int max_count = 7; /* max repeat count */
int min_count = 4; /* min repeat count */
/* tree[max_code+1].Len = -1; */ /* guard already set */
/* tree[max_code + 1].Len = -1; */ /* guard already set */
if (nextlen == 0) max_count = 138, min_count = 3;
for (n = 0; n <= max_code; n++) {
curlen = nextlen; nextlen = tree[n+1].Len;
curlen = nextlen; nextlen = tree[n + 1].Len;
if (++count < max_count && curlen == nextlen) {
continue;
} else if (count < min_count) {
@ -783,13 +779,13 @@ local void send_tree (s, tree, max_code)
send_code(s, curlen, s->bl_tree); count--;
}
Assert(count >= 3 && count <= 6, " 3_6?");
send_code(s, REP_3_6, s->bl_tree); send_bits(s, count-3, 2);
send_code(s, REP_3_6, s->bl_tree); send_bits(s, count - 3, 2);
} else if (count <= 10) {
send_code(s, REPZ_3_10, s->bl_tree); send_bits(s, count-3, 3);
send_code(s, REPZ_3_10, s->bl_tree); send_bits(s, count - 3, 3);
} else {
send_code(s, REPZ_11_138, s->bl_tree); send_bits(s, count-11, 7);
send_code(s, REPZ_11_138, s->bl_tree); send_bits(s, count - 11, 7);
}
count = 0; prevlen = curlen;
if (nextlen == 0) {
@ -817,8 +813,8 @@ local int build_bl_tree(s)
/* Build the bit length tree: */
build_tree(s, (tree_desc *)(&(s->bl_desc)));
/* opt_len now includes the length of the tree representations, except
* the lengths of the bit lengths codes and the 5+5+4 bits for the counts.
/* opt_len now includes the length of the tree representations, except the
* lengths of the bit lengths codes and the 5 + 5 + 4 bits for the counts.
*/
/* Determine the number of bit length codes to send. The pkzip format
@ -829,8 +825,8 @@ local int build_bl_tree(s)
if (s->bl_tree[bl_order[max_blindex]].Len != 0) break;
}
/* Update opt_len to include the bit length tree and counts */
s->opt_len += 3*((ulg)max_blindex+1) + 5+5+4;
Tracev(("\ndyn trees: dyn %ld, stat %ld",
s->opt_len += 3*((ulg)max_blindex + 1) + 5 + 5 + 4;
Tracev(( "\ndyn trees: dyn %ld, stat %ld",
s->opt_len, s->static_len));
return max_blindex;
@ -850,21 +846,21 @@ local void send_all_trees(s, lcodes, dcodes, blcodes)
Assert (lcodes >= 257 && dcodes >= 1 && blcodes >= 4, "not enough codes");
Assert (lcodes <= L_CODES && dcodes <= D_CODES && blcodes <= BL_CODES,
"too many codes");
Tracev(("\nbl counts: "));
send_bits(s, lcodes-257, 5); /* not +255 as stated in appnote.txt */
send_bits(s, dcodes-1, 5);
send_bits(s, blcodes-4, 4); /* not -3 as stated in appnote.txt */
Tracev(( "\nbl counts: "));
send_bits(s, lcodes - 257, 5); /* not +255 as stated in appnote.txt */
send_bits(s, dcodes - 1, 5);
send_bits(s, blcodes - 4, 4); /* not -3 as stated in appnote.txt */
for (rank = 0; rank < blcodes; rank++) {
Tracev(("\nbl code %2d ", bl_order[rank]));
Tracev(( "\nbl code %2d ", bl_order[rank]));
send_bits(s, s->bl_tree[bl_order[rank]].Len, 3);
}
Tracev(("\nbl tree: sent %ld", s->bits_sent));
Tracev(( "\nbl tree: sent %ld", s->bits_sent));
send_tree(s, (ct_data *)s->dyn_ltree, lcodes-1); /* literal tree */
Tracev(("\nlit tree: sent %ld", s->bits_sent));
send_tree(s, (ct_data *)s->dyn_ltree, lcodes - 1); /* literal tree */
Tracev(( "\nlit tree: sent %ld", s->bits_sent));
send_tree(s, (ct_data *)s->dyn_dtree, dcodes-1); /* distance tree */
Tracev(("\ndist tree: sent %ld", s->bits_sent));
send_tree(s, (ct_data *)s->dyn_dtree, dcodes - 1); /* distance tree */
Tracev(( "\ndist tree: sent %ld", s->bits_sent));
}
/* ===========================================================================
@ -876,7 +872,7 @@ void ZLIB_INTERNAL _tr_stored_block(s, buf, stored_len, last)
ulg stored_len; /* length of input block */
int last; /* one if this is the last block for a file */
{
send_bits(s, (STORED_BLOCK<<1)+last, 3); /* send block type */
send_bits(s, (STORED_BLOCK<<1) + last, 3); /* send block type */
bi_windup(s); /* align on byte boundary */
put_short(s, (ush)stored_len);
put_short(s, (ush)~stored_len);
@ -887,7 +883,7 @@ void ZLIB_INTERNAL _tr_stored_block(s, buf, stored_len, last)
s->compressed_len = (s->compressed_len + 3 + 7) & (ulg)~7L;
s->compressed_len += (stored_len + 4) << 3;
s->bits_sent += 2*16;
s->bits_sent += stored_len<<3;
s->bits_sent += stored_len << 3;
#endif
}
@ -937,11 +933,11 @@ void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
/* Construct the literal and distance trees */
build_tree(s, (tree_desc *)(&(s->l_desc)));
Tracev(("\nlit data: dyn %ld, stat %ld", s->opt_len,
Tracev(( "\nlit data: dyn %ld, stat %ld", s->opt_len,
s->static_len));
build_tree(s, (tree_desc *)(&(s->d_desc)));
Tracev(("\ndist data: dyn %ld, stat %ld", s->opt_len,
Tracev(( "\ndist data: dyn %ld, stat %ld", s->opt_len,
s->static_len));
/* At this point, opt_len and static_len are the total bit lengths of
* the compressed block data, excluding the tree representations.
@ -953,14 +949,17 @@ void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
max_blindex = build_bl_tree(s);
/* Determine the best encoding. Compute the block lengths in bytes. */
opt_lenb = (s->opt_len+3+7)>>3;
static_lenb = (s->static_len+3+7)>>3;
opt_lenb = (s->opt_len + 3 + 7) >> 3;
static_lenb = (s->static_len + 3 + 7) >> 3;
Tracev(("\nopt %lu(%lu) stat %lu(%lu) stored %lu lit %u ",
Tracev(( "\nopt %lu(%lu) stat %lu(%lu) stored %lu lit %u ",
opt_lenb, s->opt_len, static_lenb, s->static_len, stored_len,
s->sym_next / 3));
if (static_lenb <= opt_lenb) opt_lenb = static_lenb;
#ifndef FORCE_STATIC
if (static_lenb <= opt_lenb || s->strategy == Z_FIXED)
#endif
opt_lenb = static_lenb;
} else {
Assert(buf != (char*)0, "lost buf");
@ -970,7 +969,7 @@ void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
#ifdef FORCE_STORED
if (buf != (char*)0) { /* force stored block */
#else
if (stored_len+4 <= opt_lenb && buf != (char*)0) {
if (stored_len + 4 <= opt_lenb && buf != (char*)0) {
/* 4: two words for the lengths */
#endif
/* The test buf != NULL is only necessary if LIT_BUFSIZE > WSIZE.
@ -981,21 +980,17 @@ void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
*/
_tr_stored_block(s, buf, stored_len, last);
#ifdef FORCE_STATIC
} else if (static_lenb >= 0) { /* force static trees */
#else
} else if (s->strategy == Z_FIXED || static_lenb == opt_lenb) {
#endif
send_bits(s, (STATIC_TREES<<1)+last, 3);
} else if (static_lenb == opt_lenb) {
send_bits(s, (STATIC_TREES<<1) + last, 3);
compress_block(s, (const ct_data *)static_ltree,
(const ct_data *)static_dtree);
#ifdef ZLIB_DEBUG
s->compressed_len += 3 + s->static_len;
#endif
} else {
send_bits(s, (DYN_TREES<<1)+last, 3);
send_all_trees(s, s->l_desc.max_code+1, s->d_desc.max_code+1,
max_blindex+1);
send_bits(s, (DYN_TREES<<1) + last, 3);
send_all_trees(s, s->l_desc.max_code + 1, s->d_desc.max_code + 1,
max_blindex + 1);
compress_block(s, (const ct_data *)s->dyn_ltree,
(const ct_data *)s->dyn_dtree);
#ifdef ZLIB_DEBUG
@ -1014,18 +1009,18 @@ void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
s->compressed_len += 7; /* align on byte boundary */
#endif
}
Tracev(("\ncomprlen %lu(%lu) ", s->compressed_len>>3,
s->compressed_len-7*last));
Tracev(("\ncomprlen %lu(%lu) ", s->compressed_len >> 3,
s->compressed_len - 7*last));
}
/* ===========================================================================
* Save the match info and tally the frequency counts. Return true if
* the current block must be flushed.
*/
int ZLIB_INTERNAL _tr_tally (s, dist, lc)
int ZLIB_INTERNAL _tr_tally(s, dist, lc)
deflate_state *s;
unsigned dist; /* distance of matched string */
unsigned lc; /* match length-MIN_MATCH or unmatched char (if dist==0) */
unsigned lc; /* match length - MIN_MATCH or unmatched char (dist==0) */
{
s->sym_buf[s->sym_next++] = (uch)dist;
s->sym_buf[s->sym_next++] = (uch)(dist >> 8);
@ -1041,7 +1036,7 @@ int ZLIB_INTERNAL _tr_tally (s, dist, lc)
(ush)lc <= (ush)(MAX_MATCH-MIN_MATCH) &&
(ush)d_code(dist) < (ush)D_CODES, "_tr_tally: bad match");
s->dyn_ltree[_length_code[lc]+LITERALS+1].Freq++;
s->dyn_ltree[_length_code[lc] + LITERALS + 1].Freq++;
s->dyn_dtree[d_code(dist)].Freq++;
}
return (s->sym_next == s->sym_end);
@ -1071,7 +1066,7 @@ local void compress_block(s, ltree, dtree)
} else {
/* Here, lc is the match length - MIN_MATCH */
code = _length_code[lc];
send_code(s, code+LITERALS+1, ltree); /* send the length code */
send_code(s, code + LITERALS + 1, ltree); /* send length code */
extra = extra_lbits[code];
if (extra != 0) {
lc -= base_length[code];
@ -1187,6 +1182,6 @@ local void bi_windup(s)
s->bi_buf = 0;
s->bi_valid = 0;
#ifdef ZLIB_DEBUG
s->bits_sent = (s->bits_sent+7) & ~7;
s->bits_sent = (s->bits_sent + 7) & ~7;
#endif
}