Make considerably more progress on AARCH64

- Utilities like pledge.com now build
- kprintf() will no longer balk at 48-bit addresses
- There's a new aarch64-dbg build mode that should work
- gc() and defer() are mostly pacified; avoid using them on aarch64
- THIRD_PARTY_STB now has Arm Neon intrinsics for fast image handling
Justine Tunney 2023-05-12 22:42:57 -07:00
parent 1bfb3aab1b
commit fd34ef732d
GPG key ID: BE714B4575D6E328
91 changed files with 1288 additions and 1192 deletions

View file

@@ -18,5 +18,5 @@
*/
void bz_internal_error(int err) {
asm("hlt");
__builtin_trap();
}

View file

@@ -34,7 +34,7 @@ static textstartup void BZ2_crc32Table_init() {
BZ2_crc32Table[i] = u;
}
if (BZ2_crc32Table[0] || BZ2_crc32Table[255] != 0xb1f740b4) {
asm("hlt");
__builtin_trap();
}
}
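
Both hunks above replace the x86-only asm("hlt") with the compiler builtin, so the same source also builds for aarch64. A minimal sketch of the pattern (standalone example, not part of the diff; die_fast is a made-up name):

#include <stdlib.h>

// __builtin_trap() emits a trapping instruction on every GCC/Clang target
// (e.g. ud2 on x86-64, brk on aarch64), whereas inline asm("hlt") only
// assembles for x86.
static void die_fast(void) {
#if defined(__GNUC__) || defined(__clang__)
  __builtin_trap();
#else
  abort();  // fallback for compilers without the builtin
#endif
}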

View file

@@ -10,6 +10,8 @@
# This makefile compiles and runs each test twice. The first with
# GCC-built chibicc, and a second time with chibicc-built chibicc
ifeq ($(ARCH), x86_64)
CHIBICC = o/$(MODE)/third_party/chibicc/chibicc.com
CHIBICC_FLAGS = \
-fno-common \
@@ -111,6 +113,8 @@ THIRD_PARTY_CHIBICC_CHECKS = $(foreach x,$(THIRD_PARTY_CHIBICC_ARTIFACTS),$($(x)
THIRD_PARTY_CHIBICC_OBJS = $(foreach x,$(THIRD_PARTY_CHIBICC_ARTIFACTS),$($(x)_OBJS))
$(THIRD_PARTY_CHIBICC_OBJS): $(BUILD_FILES) third_party/chibicc/chibicc.mk
endif
.PHONY: o/$(MODE)/third_party/chibicc
o/$(MODE)/third_party/chibicc: \
o/$(MODE)/third_party/chibicc/test \

View file

@@ -10,6 +10,8 @@
# This makefile compiles and runs each test twice. The first with
# GCC-built chibicc, and a second time with chibicc-built chibicc
ifeq ($(ARCH), x86_64)
PKGS += THIRD_PARTY_CHIBICC_TEST
THIRD_PARTY_CHIBICC_TEST_A = o/$(MODE)/third_party/chibicc/test/test.a
@@ -75,6 +77,8 @@ o/$(MODE)/third_party/chibicc/test/%.o: \
o/$(MODE)/third_party/chibicc/test/int128_test.o: private QUOTA = -M1024m
endif
.PHONY: o/$(MODE)/third_party/chibicc/test
o/$(MODE)/third_party/chibicc/test: \
$(THIRD_PARTY_CHIBICC_TEST_BINS) \

View file

@@ -1,48 +0,0 @@
/* clang-format off */
/* ===-- lshrti3.c - Implement __lshrti3 -----------------------------------===
*
* The LLVM Compiler Infrastructure
*
* This file is dual licensed under the MIT and the University of Illinois Open
* Source Licenses. See LICENSE.TXT for details.
*
* ===----------------------------------------------------------------------===
*
* This file implements __lshrti3 for the compiler_rt library.
*
* ===----------------------------------------------------------------------===
*/
STATIC_YOINK("huge_compiler_rt_license");
#include "third_party/compiler_rt/int_lib.h"
#ifdef CRT_HAS_128BIT
/* Returns: logical a >> b */
/* Precondition: 0 <= b < bits_in_tword */
COMPILER_RT_ABI ti_int
__lshrti3(ti_int a, si_int b)
{
const int bits_in_dword = (int)(sizeof(di_int) * CHAR_BIT);
utwords input;
utwords result;
input.all = a;
if (b & bits_in_dword) /* bits_in_dword <= b < bits_in_tword */
{
result.s.high = 0;
result.s.low = input.s.high >> (b - bits_in_dword);
}
else /* 0 <= b < bits_in_dword */
{
if (b == 0)
return a;
result.s.high = input.s.high >> b;
result.s.low = (input.s.high << (bits_in_dword - b)) | (input.s.low >> b);
}
return result.all;
}
#endif /* CRT_HAS_128BIT */
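
The deleted __lshrti3 above splits its 128-bit operand into two 64-bit halves and handles the b >= 64 and b < 64 cases separately. On compilers with native __int128 the same operation is a single shift; a small reference check (a sketch, assuming GCC or Clang on a 64-bit target; lshr128 is a made-up name):

#include <assert.h>

// Logical right shift of a 128-bit value, the operation __lshrti3 provides
// for the compiler. Precondition mirrors the original: 0 <= b < 128.
static unsigned __int128 lshr128(unsigned __int128 a, int b) {
  return a >> b;
}

int main(void) {
  unsigned __int128 x =
      ((unsigned __int128)0x0123456789abcdefULL << 64) | 0xfedcba9876543210ULL;
  assert((unsigned long long)lshr128(x, 68) == (0x0123456789abcdefULL >> 4));
  assert(lshr128(x, 0) == x);
  return 0;
}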

View file

@@ -0,0 +1 @@
../../../../x86_64-linux-musl/bin/ld.bfd

View file

@@ -68,7 +68,7 @@ static __inline int regexec_e(regex_t *, const char *, int, int, size_t);
static void regsub(SPACE *, char *, char *);
static int substitute(struct s_command *);
struct s_appends *appends; /* Array of pointers to strings to append. */
struct s_appends *appends_; /* Array of pointers to strings to append. */
static size_t appendx; /* Index into appends array. */
size_t appendnum; /* Size of appends array. */
@@ -111,12 +111,12 @@ redirect:
goto redirect;
case 'a':
if (appendx >= appendnum)
appends = xrealloc(appends,
appends_ = xrealloc(appends_,
sizeof(struct s_appends) *
(appendnum *= 2));
appends[appendx].type = AP_STRING;
appends[appendx].s = cp->t;
appends[appendx].len = strlen(cp->t);
appends_[appendx].type = AP_STRING;
appends_[appendx].s = cp->t;
appends_[appendx].len = strlen(cp->t);
appendx++;
break;
case 'b':
@@ -204,12 +204,12 @@ redirect:
exit(0);
case 'r':
if (appendx >= appendnum)
appends = xrealloc(appends,
appends_ = xrealloc(appends_,
sizeof(struct s_appends) *
(appendnum *= 2));
appends[appendx].type = AP_FILE;
appends[appendx].s = cp->t;
appends[appendx].len = strlen(cp->t);
appends_[appendx].type = AP_FILE;
appends_[appendx].s = cp->t;
appends_[appendx].len = strlen(cp->t);
appendx++;
break;
case 's':
@@ -541,9 +541,9 @@ flush_appends(void)
char *buf = gc(malloc(8 * 1024));
for (i = 0; i < appendx; i++)
switch (appends[i].type) {
switch (appends_[i].type) {
case AP_STRING:
fwrite(appends[i].s, sizeof(char), appends[i].len,
fwrite(appends_[i].s, sizeof(char), appends_[i].len,
outfile);
break;
case AP_FILE:
@@ -555,7 +555,7 @@ flush_appends(void)
* would be truly bizarre, but possible. It's probably
* not that big a performance win, anyhow.
*/
if ((f = fopen(appends[i].s, "r")) == NULL)
if ((f = fopen(appends_[i].s, "r")) == NULL)
break;
while ((count = fread(buf, sizeof(char), sizeof(buf), f)))
(void)fwrite(buf, sizeof(char), count, outfile);

View file

@@ -58,6 +58,7 @@ THIRD_PARTY_SQLITE3_A_DIRECTDEPS = \
LIBC_TIME \
LIBC_TINYMATH \
LIBC_ZIPOS \
THIRD_PARTY_COMPILER_RT \
THIRD_PARTY_GDTOA \
THIRD_PARTY_LINENOISE \
THIRD_PARTY_MUSL \
@@ -79,7 +80,8 @@ o/$(MODE)/third_party/sqlite3/sqlite3.com.dbg: \
o/$(MODE)/third_party/sqlite3/sqlite3.com: \
o/$(MODE)/third_party/sqlite3/sqlite3.com.dbg \
o/$(MODE)/third_party/zip/zip.com \
o/$(MODE)/tool/build/symtab.com
o/$(MODE)/tool/build/symtab.com \
$(VM)
@$(MAKE_OBJCOPY)
@$(MAKE_SYMTAB_CREATE)
@$(MAKE_SYMTAB_ZIP)

View file

@@ -1,426 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
// Computes inverse discrete cosine transform.
//
// @note used to decode jpeg
.p2align 4
stbi__idct_simd$sse:
push %rbp
mov %rsp,%rbp
movslq %esi,%rsi
lea (%rdi,%rsi),%rax
sub $96,%rsp
movdqa 32(%rdx),%xmm0
movdqa 112(%rdx),%xmm9
movdqa 48(%rdx),%xmm1
movdqa 80(%rdx),%xmm7
movdqa %xmm0,%xmm2
punpcklwd 96(%rdx),%xmm2
punpckhwd 96(%rdx),%xmm0
movdqa %xmm9,%xmm8
movdqa 16(%rdx),%xmm5
movdqa %xmm2,%xmm3
movdqa %xmm2,%xmm6
movdqa %xmm0,%xmm2
pmaddwd .LC1(%rip),%xmm3
movdqa %xmm0,%xmm4
pmaddwd .LC1(%rip),%xmm2
pmaddwd .LC0(%rip),%xmm4
punpckhwd %xmm1,%xmm8
pmaddwd .LC0(%rip),%xmm6
movaps %xmm3,-48(%rbp)
movdqa (%rdx),%xmm3
movaps %xmm2,-64(%rbp)
movdqa 64(%rdx),%xmm2
movdqa %xmm3,%xmm0
movaps %xmm4,-32(%rbp)
paddw %xmm2,%xmm0
psubw %xmm2,%xmm3
movaps %xmm6,-16(%rbp)
movdqa %xmm0,%xmm4
pxor %xmm0,%xmm0
movdqa %xmm0,%xmm11
movdqa %xmm0,%xmm12
movdqa %xmm0,%xmm2
punpcklwd %xmm4,%xmm11
punpckhwd %xmm3,%xmm12
punpcklwd %xmm3,%xmm2
movdqa %xmm11,%xmm13
movdqa %xmm0,%xmm11
movdqa %xmm12,%xmm3
punpckhwd %xmm4,%xmm11
movdqa %xmm8,%xmm12
movdqa %xmm8,%xmm4
movdqa %xmm11,%xmm14
movdqa %xmm7,%xmm8
movdqa %xmm9,%xmm11
punpckhwd %xmm5,%xmm8
psrad $4,%xmm3
punpcklwd %xmm1,%xmm11
psrad $4,%xmm13
psrad $4,%xmm14
movdqa %xmm11,%xmm15
movaps %xmm13,-80(%rbp)
movdqa %xmm8,%xmm6
paddw %xmm7,%xmm1
pmaddwd .LC3(%rip),%xmm15
movaps %xmm14,-96(%rbp)
movdqa %xmm8,%xmm14
movdqa %xmm5,%xmm8
pmaddwd .LC2(%rip),%xmm11
pmaddwd .LC2(%rip),%xmm12
paddw %xmm9,%xmm8
psrad $4,%xmm2
pmaddwd .LC3(%rip),%xmm4
pmaddwd .LC5(%rip),%xmm6
pmaddwd .LC4(%rip),%xmm14
movdqa %xmm4,%xmm10
movdqa %xmm7,%xmm4
movdqa %xmm8,%xmm7
punpcklwd %xmm5,%xmm4
punpcklwd %xmm1,%xmm7
punpckhwd %xmm1,%xmm8
movdqa %xmm4,%xmm13
movdqa %xmm7,%xmm9
pmaddwd .LC5(%rip),%xmm4
pmaddwd .LC6(%rip),%xmm9
movdqa %xmm8,%xmm5
movdqa %xmm7,%xmm1
pmaddwd .LC7(%rip),%xmm8
pmaddwd .LC6(%rip),%xmm5
movdqa %xmm15,%xmm7
paddd %xmm9,%xmm11
paddd %xmm9,%xmm4
movdqa .LC8(%rip),%xmm9
paddd %xmm8,%xmm14
paddd %xmm10,%xmm8
movdqa -96(%rbp),%xmm10
paddd -64(%rbp),%xmm10
pmaddwd .LC7(%rip),%xmm1
pmaddwd .LC4(%rip),%xmm13
paddd %xmm5,%xmm12
paddd %xmm5,%xmm6
paddd %xmm9,%xmm10
movdqa -80(%rbp),%xmm5
paddd -48(%rbp),%xmm5
paddd %xmm1,%xmm13
paddd %xmm1,%xmm7
movdqa %xmm10,%xmm1
psubd %xmm6,%xmm10
paddd %xmm9,%xmm5
paddd %xmm6,%xmm1
psrad $10,%xmm10
movdqa -16(%rbp),%xmm6
movdqa %xmm1,%xmm15
movdqa %xmm5,%xmm1
psubd %xmm4,%xmm5
psrad $10,%xmm5
paddd %xmm4,%xmm1
paddd %xmm2,%xmm6
packssdw %xmm10,%xmm5
movdqa -32(%rbp),%xmm10
paddd %xmm9,%xmm6
paddd %xmm9,%xmm2
psrad $10,%xmm15
psrad $10,%xmm1
psubd -16(%rbp),%xmm2
paddd %xmm3,%xmm10
paddd %xmm9,%xmm3
packssdw %xmm15,%xmm1
paddd %xmm9,%xmm10
psubd -32(%rbp),%xmm3
movdqa %xmm10,%xmm4
psubd %xmm8,%xmm10
paddd %xmm8,%xmm4
psrad $10,%xmm10
movdqa %xmm4,%xmm15
movdqa %xmm6,%xmm4
psubd %xmm7,%xmm6
psrad $10,%xmm6
psrad $10,%xmm15
paddd %xmm7,%xmm4
movdqa %xmm3,%xmm7
psubd %xmm14,%xmm3
packssdw %xmm10,%xmm6
psrad $10,%xmm3
psrad $10,%xmm4
paddd %xmm14,%xmm7
movdqa %xmm7,%xmm8
movdqa %xmm2,%xmm7
psubd %xmm13,%xmm2
paddd %xmm13,%xmm7
psrad $10,%xmm8
packssdw %xmm15,%xmm4
psrad $10,%xmm7
psrad $10,%xmm2
packssdw %xmm8,%xmm7
movdqa -80(%rbp),%xmm8
packssdw %xmm3,%xmm2
paddd %xmm9,%xmm8
paddd -96(%rbp),%xmm9
psubd -48(%rbp),%xmm8
psubd -64(%rbp),%xmm9
movdqa %xmm8,%xmm3
movdqa %xmm9,%xmm10
psubd %xmm11,%xmm8
paddd %xmm12,%xmm10
paddd %xmm11,%xmm3
psrad $10,%xmm8
psrad $10,%xmm10
psrad $10,%xmm3
psubd %xmm12,%xmm9
psrad $10,%xmm9
packssdw %xmm10,%xmm3
movdqa %xmm1,%xmm10
packssdw %xmm9,%xmm8
movdqa %xmm7,%xmm9
punpckhwd %xmm6,%xmm7
punpcklwd %xmm6,%xmm9
punpcklwd %xmm8,%xmm10
punpckhwd %xmm8,%xmm1
movdqa %xmm3,%xmm6
movdqa %xmm4,%xmm8
punpckhwd %xmm5,%xmm3
punpcklwd %xmm5,%xmm6
punpcklwd %xmm2,%xmm8
movdqa %xmm3,%xmm5
punpckhwd %xmm2,%xmm4
movdqa %xmm8,%xmm3
movdqa %xmm10,%xmm2
punpckhwd %xmm6,%xmm8
punpcklwd %xmm6,%xmm3
punpcklwd %xmm9,%xmm2
movdqa %xmm8,%xmm6
movdqa %xmm4,%xmm8
punpckhwd %xmm9,%xmm10
punpcklwd %xmm5,%xmm8
punpckhwd %xmm5,%xmm4
movdqa %xmm2,%xmm5
punpcklwd %xmm3,%xmm5
punpckhwd %xmm3,%xmm2
movdqa %xmm1,%xmm15
movdqa %xmm10,%xmm3
punpckhwd %xmm7,%xmm1
punpckhwd %xmm6,%xmm10
punpcklwd %xmm6,%xmm3
movdqa %xmm1,%xmm6
punpckhwd %xmm4,%xmm1
punpcklwd %xmm4,%xmm6
movdqa %xmm3,%xmm4
punpcklwd %xmm7,%xmm15
punpcklwd %xmm6,%xmm4
punpckhwd %xmm6,%xmm3
movdqa %xmm15,%xmm7
movdqa %xmm4,%xmm6
punpcklwd %xmm8,%xmm7
movdqa %xmm3,%xmm11
movdqa %xmm4,%xmm12
movdqa %xmm3,%xmm4
movdqa %xmm5,%xmm3
paddw %xmm7,%xmm3
movdqa %xmm1,%xmm9
punpckhwd %xmm8,%xmm15
punpcklwd %xmm10,%xmm9
psubw %xmm7,%xmm5
movdqa %xmm15,%xmm7
movdqa %xmm9,%xmm14
punpcklwd %xmm2,%xmm7
movdqa %xmm1,%xmm8
pmaddwd .LC0(%rip),%xmm6
punpckhwd %xmm10,%xmm8
paddw %xmm15,%xmm10
movaps %xmm6,-16(%rbp)
pmaddwd .LC1(%rip),%xmm4
movdqa %xmm0,%xmm6
pmaddwd .LC0(%rip),%xmm11
pmaddwd .LC2(%rip),%xmm14
pmaddwd .LC1(%rip),%xmm12
pmaddwd .LC3(%rip),%xmm9
movaps %xmm4,-64(%rbp)
movdqa %xmm3,%xmm4
movdqa %xmm0,%xmm3
punpckhwd %xmm4,%xmm6
punpcklwd %xmm4,%xmm3
movdqa %xmm0,%xmm4
movaps %xmm11,-32(%rbp)
movdqa %xmm6,%xmm13
movdqa %xmm15,%xmm6
punpcklwd %xmm5,%xmm4
movaps %xmm12,-48(%rbp)
punpckhwd %xmm2,%xmm6
paddw %xmm1,%xmm2
punpckhwd %xmm5,%xmm0
movdqa %xmm14,%xmm11
movdqa %xmm2,%xmm5
movdqa %xmm7,%xmm14
punpckhwd %xmm10,%xmm2
psrad $4,%xmm13
punpcklwd %xmm10,%xmm5
movaps %xmm13,-80(%rbp)
movdqa %xmm8,%xmm12
movdqa %xmm5,%xmm10
pmaddwd .LC4(%rip),%xmm14
pmaddwd .LC6(%rip),%xmm10
movdqa %xmm2,%xmm15
pmaddwd .LC7(%rip),%xmm5
pmaddwd .LC3(%rip),%xmm8
pmaddwd .LC5(%rip),%xmm7
movdqa %xmm14,%xmm13
movdqa %xmm6,%xmm14
paddd %xmm5,%xmm13
paddd %xmm5,%xmm9
pmaddwd .LC5(%rip),%xmm6
psrad $4,%xmm3
pmaddwd .LC6(%rip),%xmm15
paddd %xmm10,%xmm7
paddd %xmm10,%xmm11
psrad $4,%xmm4
pmaddwd .LC2(%rip),%xmm12
psrad $4,%xmm0
pmaddwd .LC4(%rip),%xmm14
pmaddwd .LC7(%rip),%xmm2
movdqa -80(%rbp),%xmm5
paddd %xmm15,%xmm12
paddd -64(%rbp),%xmm5
paddd %xmm2,%xmm14
paddd %xmm8,%xmm2
movdqa -48(%rbp),%xmm8
paddd %xmm6,%xmm15
movdqa .LC9(%rip),%xmm6
paddd %xmm3,%xmm8
paddd %xmm6,%xmm8
paddd %xmm6,%xmm5
movdqa %xmm5,%xmm10
movdqa %xmm8,%xmm1
psubd %xmm15,%xmm5
psubd %xmm7,%xmm8
psrad $17,%xmm5
paddd %xmm7,%xmm1
movdqa -32(%rbp),%xmm7
psrad $17,%xmm8
paddd %xmm15,%xmm10
paddd %xmm6,%xmm3
packssdw %xmm5,%xmm8
movdqa -16(%rbp),%xmm5
paddd %xmm0,%xmm7
paddd %xmm6,%xmm0
paddd %xmm6,%xmm7
psrad $17,%xmm10
psubd -32(%rbp),%xmm0
paddd %xmm4,%xmm5
psrad $17,%xmm1
movdqa %xmm7,%xmm15
paddd %xmm6,%xmm5
packssdw %xmm10,%xmm1
psubd %xmm2,%xmm7
movdqa %xmm5,%xmm10
paddd %xmm6,%xmm4
psubd %xmm9,%xmm5
psubd -16(%rbp),%xmm4
psrad $17,%xmm7
paddd %xmm2,%xmm15
psrad $17,%xmm5
psubd -48(%rbp),%xmm3
paddd -80(%rbp),%xmm6
packssdw %xmm7,%xmm5
movdqa %xmm4,%xmm2
movdqa %xmm0,%xmm7
psubd -64(%rbp),%xmm6
paddd %xmm14,%xmm7
psrad $17,%xmm15
paddd %xmm13,%xmm2
psubd %xmm14,%xmm0
psrad $17,%xmm7
psubd %xmm13,%xmm4
psrad $17,%xmm0
paddd %xmm9,%xmm10
psrad $17,%xmm2
psrad $17,%xmm4
packuswb %xmm8,%xmm5
packssdw %xmm0,%xmm4
packssdw %xmm7,%xmm2
movdqa %xmm3,%xmm0
movdqa %xmm6,%xmm7
psrad $17,%xmm10
paddd %xmm11,%xmm0
paddd %xmm12,%xmm7
psubd %xmm12,%xmm6
packssdw %xmm15,%xmm10
psubd %xmm11,%xmm3
psrad $17,%xmm7
packuswb %xmm10,%xmm1
psrad $17,%xmm0
psrad $17,%xmm6
psrad $17,%xmm3
packssdw %xmm7,%xmm0
packssdw %xmm6,%xmm3
packuswb %xmm0,%xmm2
movdqa %xmm1,%xmm0
packuswb %xmm4,%xmm3
movdqa %xmm2,%xmm4
punpckhbw %xmm5,%xmm2
punpcklbw %xmm3,%xmm0
punpcklbw %xmm5,%xmm4
punpckhbw %xmm3,%xmm1
movdqa %xmm2,%xmm3
movdqa %xmm0,%xmm2
movdqa %xmm1,%xmm5
punpcklbw %xmm4,%xmm2
punpckhbw %xmm4,%xmm0
punpcklbw %xmm3,%xmm5
movdqa %xmm2,%xmm4
punpckhbw %xmm5,%xmm2
punpckhbw %xmm3,%xmm1
punpcklbw %xmm5,%xmm4
movdqa %xmm0,%xmm3
punpckhbw %xmm1,%xmm0
movq %xmm4,(%rdi)
pshufd $78,%xmm4,%xmm4
punpcklbw %xmm1,%xmm3
movq %xmm4,(%rax)
add %rsi,%rax
movq %xmm2,(%rax)
add %rsi,%rax
pshufd $78,%xmm2,%xmm2
movq %xmm2,(%rax)
add %rsi,%rax
movq %xmm3,(%rax)
add %rsi,%rax
pshufd $78,%xmm3,%xmm3
movq %xmm3,(%rax)
movq %xmm0,(%rax,%rsi)
pshufd $78,%xmm0,%xmm0
movq %xmm0,(%rax,%rsi,2)
leave
ret
.endfn stbi__idct_simd$sse,globl
.rodata.cst16
.LC0: .value 2217,-5350,2217,-5350,2217,-5350,2217,-5350
.LC1: .value 5352,2217,5352,2217,5352,2217,5352,2217
.LC2: .value -6811,-8034,-6811,-8034,-6811,-8034,-6811,-8034
.LC3: .value -8034,4552,-8034,4552,-8034,4552,-8034,4552
.LC4: .value 6813,-1597,6813,-1597,6813,-1597,6813,-1597
.LC5: .value -1597,4552,-1597,4552,-1597,4552,-1597,4552
.LC6: .value 1131,4816,1131,4816,1131,4816,1131,4816
.LC7: .value 4816,-5681,4816,-5681,4816,-5681,4816,-5681
.LC8: .long 0x200,0x200,0x200,0x200
.LC9: .long 0x1010000,0x1010000,0x1010000,0x1010000

View file

@@ -1,19 +0,0 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_STB_INTERNAL_H_
#define COSMOPOLITAN_THIRD_PARTY_STB_INTERNAL_H_
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
void stbi__YCbCr_to_RGB_row(unsigned char *, const unsigned char *,
const unsigned char *, const unsigned char *,
unsigned, unsigned) _Hide;
int stbi__YCbCr_to_RGB_row$sse2(unsigned char *, const unsigned char *,
const unsigned char *, const unsigned char *,
unsigned) _Hide;
void stbi__idct_simd$sse(unsigned char *out, int out_stride,
short data[64]) _Hide;
void stbi__idct_simd$avx(unsigned char *out, int out_stride,
short data[64]) _Hide;
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_STB_INTERNAL_H_ */

View file

@@ -16,6 +16,7 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "third_party/stb/stb_image.h"
#include "libc/assert.h"
#include "libc/calls/calls.h"
#include "libc/fmt/conv.h"
@@ -31,20 +32,33 @@
#include "libc/stdio/stdio.h"
#include "libc/str/str.h"
#include "libc/x/x.h"
#include "third_party/stb/internal.h"
#include "third_party/stb/stb_image.h"
#define ROL(w, k) ((w) << (k) | CheckUnsigned(w) >> (sizeof(w) * 8 - (k)))
#include "third_party/aarch64/arm_neon.h"
#include "third_party/intel/ammintrin.internal.h"
asm(".ident\t\"\\n\\n\
stb_image (Public Domain)\\n\
Credit: Sean Barrett, et al.\\n\
http://nothings.org/stb\"");
#ifdef __x86_64__
#define STBI_SSE2
#define idct_block_kernel stbi__idct_simd
#elif defined(__aarch64__)
#define STBI_NEON
#define idct_block_kernel stbi__idct_simd
#else
#define idct_block_kernel stbi__idct_block
#endif
#define ROL(w, k) ((w) << (k) | CheckUnsigned(w) >> (sizeof(w) * 8 - (k)))
#ifndef STBI_REALLOC_SIZED
#define STBI_REALLOC_SIZED(p, oldsz, newsz) realloc(p, newsz)
#endif
typedef unsigned char stbi_uc;
typedef unsigned short stbi_us;
// stbi__context structure is our basic context used by all images, so it
// contains all the IO context, plus some basic image information
typedef struct {
@@ -1219,6 +1233,556 @@ forceinline unsigned char stbi__clamp(int x) {
return (unsigned char)x;
}
#define stbi__f2f(x) ((int)(((x)*4096 + 0.5)))
#define stbi__fsh(x) ((x)*4096)
// derived from jidctint -- DCT_ISLOW
#define STBI__IDCT_1D(s0, s1, s2, s3, s4, s5, s6, s7) \
int t0, t1, t2, t3, p1, p2, p3, p4, p5, x0, x1, x2, x3; \
p2 = s2; \
p3 = s6; \
p1 = (p2 + p3) * stbi__f2f(0.5411961f); \
t2 = p1 + p3 * stbi__f2f(-1.847759065f); \
t3 = p1 + p2 * stbi__f2f(0.765366865f); \
p2 = s0; \
p3 = s4; \
t0 = stbi__fsh(p2 + p3); \
t1 = stbi__fsh(p2 - p3); \
x0 = t0 + t3; \
x3 = t0 - t3; \
x1 = t1 + t2; \
x2 = t1 - t2; \
t0 = s7; \
t1 = s5; \
t2 = s3; \
t3 = s1; \
p3 = t0 + t2; \
p4 = t1 + t3; \
p1 = t0 + t3; \
p2 = t1 + t2; \
p5 = (p3 + p4) * stbi__f2f(1.175875602f); \
t0 = t0 * stbi__f2f(0.298631336f); \
t1 = t1 * stbi__f2f(2.053119869f); \
t2 = t2 * stbi__f2f(3.072711026f); \
t3 = t3 * stbi__f2f(1.501321110f); \
p1 = p5 + p1 * stbi__f2f(-0.899976223f); \
p2 = p5 + p2 * stbi__f2f(-2.562915447f); \
p3 = p3 * stbi__f2f(-1.961570560f); \
p4 = p4 * stbi__f2f(-0.390180644f); \
t3 += p1 + p4; \
t2 += p2 + p3; \
t1 += p2 + p4; \
t0 += p1 + p3;
static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64]) {
int i, val[64], *v = val;
stbi_uc *o;
short *d = data;
// columns
for (i = 0; i < 8; ++i, ++d, ++v) {
// if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
if (d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0 && d[40] == 0 &&
d[48] == 0 && d[56] == 0) {
// no shortcut 0 seconds
// (1|2|3|4|5|6|7)==0 0 seconds
// all separate -0.047 seconds
// 1 && 2|3 && 4|5 && 6|7: -0.047 seconds
int dcterm = d[0] * 4;
v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
} else {
STBI__IDCT_1D(d[0], d[8], d[16], d[24], d[32], d[40], d[48], d[56])
// constants scaled things up by 1<<12; let's bring them back
// down, but keep 2 extra bits of precision
x0 += 512;
x1 += 512;
x2 += 512;
x3 += 512;
v[0] = (x0 + t3) >> 10;
v[56] = (x0 - t3) >> 10;
v[8] = (x1 + t2) >> 10;
v[48] = (x1 - t2) >> 10;
v[16] = (x2 + t1) >> 10;
v[40] = (x2 - t1) >> 10;
v[24] = (x3 + t0) >> 10;
v[32] = (x3 - t0) >> 10;
}
}
for (i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride) {
// no fast case since the first 1D IDCT spread components out
STBI__IDCT_1D(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7])
// constants scaled things up by 1<<12, plus we had 1<<2 from first
// loop, plus horizontal and vertical each scale by sqrt(8) so together
// we've got an extra 1<<3, so 1<<17 total we need to remove.
// so we want to round that, which means adding 0.5 * 1<<17,
// aka 65536. Also, we'll end up with -128 to 127 that we want
// to encode as 0..255 by adding 128, so we'll add that before the shift
x0 += 65536 + (128 << 17);
x1 += 65536 + (128 << 17);
x2 += 65536 + (128 << 17);
x3 += 65536 + (128 << 17);
// tried computing the shifts into temps, or'ing the temps to see
// if any were out of range, but that was slower
o[0] = stbi__clamp((x0 + t3) >> 17);
o[7] = stbi__clamp((x0 - t3) >> 17);
o[1] = stbi__clamp((x1 + t2) >> 17);
o[6] = stbi__clamp((x1 - t2) >> 17);
o[2] = stbi__clamp((x2 + t1) >> 17);
o[5] = stbi__clamp((x2 - t1) >> 17);
o[3] = stbi__clamp((x3 + t0) >> 17);
o[4] = stbi__clamp((x3 - t0) >> 17);
}
}
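
The shift amounts in stbi__idct_block follow the comments above: the stbi__f2f constants carry 12 fraction bits, the column pass keeps 2 extra bits of precision (hence >> 10 with a bias of 512), and the row pass additionally removes the 1<<3 scale left by the two sqrt(8) factors (hence >> 17 with a 65536 rounding term). A standalone arithmetic check of that bookkeeping (a sketch, not part of the diff):

#include <assert.h>

#define stbi__f2f(x) ((int)((x)*4096 + 0.5))  /* 4.12 fixed point, as above */

int main(void) {
  assert(stbi__f2f(0.5411961f) == 2217);  /* same value as .LC0 in the deleted asm */
  assert(12 - 2 == 10);            /* column pass: drop 12 bits, keep 2 -> shift by 10 */
  assert(12 + 2 + 3 == 17);        /* row pass: 12 + 2 kept + log2(8) -> shift by 17 */
  assert((1 << 17) / 2 == 65536);  /* 0.5 rounding bias added before >> 17 */
  return 0;
}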
#ifdef STBI_SSE2
// sse2 integer IDCT. not the fastest possible implementation but it
// produces bit-identical results to the generic C version so it's
// fully "transparent".
static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) {
// This is constructed to match our regular (generic) integer IDCT exactly.
__m128i row0, row1, row2, row3, row4, row5, row6, row7;
__m128i tmp;
// dot product constant: even elems=x, odd elems=y
#define dct_const(x, y) _mm_setr_epi16((x), (y), (x), (y), (x), (y), (x), (y))
// out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit)
// out(1) = c1[even]*x + c1[odd]*y
#define dct_rot(out0, out1, x, y, c0, c1) \
__m128i c0##lo = _mm_unpacklo_epi16((x), (y)); \
__m128i c0##hi = _mm_unpackhi_epi16((x), (y)); \
__m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
__m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
__m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
__m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
// out = in << 12 (in 16-bit, out 32-bit)
#define dct_widen(out, in) \
__m128i out##_l = \
_mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
__m128i out##_h = \
_mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
// wide add
#define dct_wadd(out, a, b) \
__m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
__m128i out##_h = _mm_add_epi32(a##_h, b##_h)
// wide sub
#define dct_wsub(out, a, b) \
__m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
__m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
// butterfly a/b, add bias, then shift by "s" and pack
#define dct_bfly32o(out0, out1, a, b, bias, s) \
{ \
__m128i abiased_l = _mm_add_epi32(a##_l, bias); \
__m128i abiased_h = _mm_add_epi32(a##_h, bias); \
dct_wadd(sum, abiased, b); \
dct_wsub(dif, abiased, b); \
out0 = \
_mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
out1 = \
_mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
}
// 8-bit interleave step (for transposes)
#define dct_interleave8(a, b) \
tmp = a; \
a = _mm_unpacklo_epi8(a, b); \
b = _mm_unpackhi_epi8(tmp, b)
// 16-bit interleave step (for transposes)
#define dct_interleave16(a, b) \
tmp = a; \
a = _mm_unpacklo_epi16(a, b); \
b = _mm_unpackhi_epi16(tmp, b)
#define dct_pass(bias, shift) \
{ \
/* even part */ \
dct_rot(t2e, t3e, row2, row6, rot0_0, rot0_1); \
__m128i sum04 = _mm_add_epi16(row0, row4); \
__m128i dif04 = _mm_sub_epi16(row0, row4); \
dct_widen(t0e, sum04); \
dct_widen(t1e, dif04); \
dct_wadd(x0, t0e, t3e); \
dct_wsub(x3, t0e, t3e); \
dct_wadd(x1, t1e, t2e); \
dct_wsub(x2, t1e, t2e); \
/* odd part */ \
dct_rot(y0o, y2o, row7, row3, rot2_0, rot2_1); \
dct_rot(y1o, y3o, row5, row1, rot3_0, rot3_1); \
__m128i sum17 = _mm_add_epi16(row1, row7); \
__m128i sum35 = _mm_add_epi16(row3, row5); \
dct_rot(y4o, y5o, sum17, sum35, rot1_0, rot1_1); \
dct_wadd(x4, y0o, y4o); \
dct_wadd(x5, y1o, y5o); \
dct_wadd(x6, y2o, y5o); \
dct_wadd(x7, y3o, y4o); \
dct_bfly32o(row0, row7, x0, x7, bias, shift); \
dct_bfly32o(row1, row6, x1, x6, bias, shift); \
dct_bfly32o(row2, row5, x2, x5, bias, shift); \
dct_bfly32o(row3, row4, x3, x4, bias, shift); \
}
__m128i rot0_0 = dct_const(stbi__f2f(0.5411961f),
stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
__m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f(0.765366865f),
stbi__f2f(0.5411961f));
__m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f),
stbi__f2f(1.175875602f));
__m128i rot1_1 =
dct_const(stbi__f2f(1.175875602f),
stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
__m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f(0.298631336f),
stbi__f2f(-1.961570560f));
__m128i rot2_1 =
dct_const(stbi__f2f(-1.961570560f),
stbi__f2f(-1.961570560f) + stbi__f2f(3.072711026f));
__m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f(2.053119869f),
stbi__f2f(-0.390180644f));
__m128i rot3_1 =
dct_const(stbi__f2f(-0.390180644f),
stbi__f2f(-0.390180644f) + stbi__f2f(1.501321110f));
// rounding biases in column/row passes, see stbi__idct_block for explanation.
__m128i bias_0 = _mm_set1_epi32(512);
__m128i bias_1 = _mm_set1_epi32(65536 + (128 << 17));
// load
row0 = _mm_load_si128((const __m128i *)(data + 0 * 8));
row1 = _mm_load_si128((const __m128i *)(data + 1 * 8));
row2 = _mm_load_si128((const __m128i *)(data + 2 * 8));
row3 = _mm_load_si128((const __m128i *)(data + 3 * 8));
row4 = _mm_load_si128((const __m128i *)(data + 4 * 8));
row5 = _mm_load_si128((const __m128i *)(data + 5 * 8));
row6 = _mm_load_si128((const __m128i *)(data + 6 * 8));
row7 = _mm_load_si128((const __m128i *)(data + 7 * 8));
// column pass
dct_pass(bias_0, 10);
{
// 16bit 8x8 transpose pass 1
dct_interleave16(row0, row4);
dct_interleave16(row1, row5);
dct_interleave16(row2, row6);
dct_interleave16(row3, row7);
// transpose pass 2
dct_interleave16(row0, row2);
dct_interleave16(row1, row3);
dct_interleave16(row4, row6);
dct_interleave16(row5, row7);
// transpose pass 3
dct_interleave16(row0, row1);
dct_interleave16(row2, row3);
dct_interleave16(row4, row5);
dct_interleave16(row6, row7);
}
// row pass
dct_pass(bias_1, 17);
{
// pack
__m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
__m128i p1 = _mm_packus_epi16(row2, row3);
__m128i p2 = _mm_packus_epi16(row4, row5);
__m128i p3 = _mm_packus_epi16(row6, row7);
// 8bit 8x8 transpose pass 1
dct_interleave8(p0, p2); // a0e0a1e1...
dct_interleave8(p1, p3); // c0g0c1g1...
// transpose pass 2
dct_interleave8(p0, p1); // a0c0e0g0...
dct_interleave8(p2, p3); // b0d0f0h0...
// transpose pass 3
dct_interleave8(p0, p2); // a0b0c0d0...
dct_interleave8(p1, p3); // a4b4c4d4...
// store
_mm_storel_epi64((__m128i *)out, p0);
out += out_stride;
_mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p0, 0x4e));
out += out_stride;
_mm_storel_epi64((__m128i *)out, p2);
out += out_stride;
_mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p2, 0x4e));
out += out_stride;
_mm_storel_epi64((__m128i *)out, p1);
out += out_stride;
_mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p1, 0x4e));
out += out_stride;
_mm_storel_epi64((__m128i *)out, p3);
out += out_stride;
_mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p3, 0x4e));
}
#undef dct_const
#undef dct_rot
#undef dct_widen
#undef dct_wadd
#undef dct_wsub
#undef dct_bfly32o
#undef dct_interleave8
#undef dct_interleave16
#undef dct_pass
}
#endif // STBI_SSE2
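
As the dct_rot comments above note, _mm_madd_epi16 on interleaved (x, y) pairs against a constant whose even/odd lanes hold (c_even, c_odd) yields c_even*x + c_odd*y in each 32-bit lane. A scalar model of one lane, using the 2217/-5350 pair from the deleted .LC0 table (a sketch, not part of the diff; dct_rot_lane is a made-up name):

#include <assert.h>
#include <stdint.h>

// One 32-bit output lane of dct_rot: what the unpack + madd pair computes.
static int32_t dct_rot_lane(int16_t x, int16_t y, int16_t c_even, int16_t c_odd) {
  return (int32_t)c_even * x + (int32_t)c_odd * y;
}

int main(void) {
  assert(dct_rot_lane(3, 2, 2217, -5350) == 3 * 2217 - 2 * 5350);
  return 0;
}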
#ifdef STBI_NEON
// NEON integer IDCT. should produce bit-identical
// results to the generic C version.
static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) {
int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f));
int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
int16x4_t rot0_2 = vdup_n_s16(stbi__f2f(0.765366865f));
int16x4_t rot1_0 = vdup_n_s16(stbi__f2f(1.175875602f));
int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
int16x4_t rot3_0 = vdup_n_s16(stbi__f2f(0.298631336f));
int16x4_t rot3_1 = vdup_n_s16(stbi__f2f(2.053119869f));
int16x4_t rot3_2 = vdup_n_s16(stbi__f2f(3.072711026f));
int16x4_t rot3_3 = vdup_n_s16(stbi__f2f(1.501321110f));
#define dct_long_mul(out, inq, coeff) \
int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
#define dct_long_mac(out, acc, inq, coeff) \
int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
#define dct_widen(out, inq) \
int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
// wide add
#define dct_wadd(out, a, b) \
int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
// wide sub
#define dct_wsub(out, a, b) \
int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
// butterfly a/b, then shift using "shiftop" by "s" and pack
#define dct_bfly32o(out0, out1, a, b, shiftop, s) \
{ \
dct_wadd(sum, a, b); \
dct_wsub(dif, a, b); \
out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
}
#define dct_pass(shiftop, shift) \
{ \
/* even part */ \
int16x8_t sum26 = vaddq_s16(row2, row6); \
dct_long_mul(p1e, sum26, rot0_0); \
dct_long_mac(t2e, p1e, row6, rot0_1); \
dct_long_mac(t3e, p1e, row2, rot0_2); \
int16x8_t sum04 = vaddq_s16(row0, row4); \
int16x8_t dif04 = vsubq_s16(row0, row4); \
dct_widen(t0e, sum04); \
dct_widen(t1e, dif04); \
dct_wadd(x0, t0e, t3e); \
dct_wsub(x3, t0e, t3e); \
dct_wadd(x1, t1e, t2e); \
dct_wsub(x2, t1e, t2e); \
/* odd part */ \
int16x8_t sum15 = vaddq_s16(row1, row5); \
int16x8_t sum17 = vaddq_s16(row1, row7); \
int16x8_t sum35 = vaddq_s16(row3, row5); \
int16x8_t sum37 = vaddq_s16(row3, row7); \
int16x8_t sumodd = vaddq_s16(sum17, sum35); \
dct_long_mul(p5o, sumodd, rot1_0); \
dct_long_mac(p1o, p5o, sum17, rot1_1); \
dct_long_mac(p2o, p5o, sum35, rot1_2); \
dct_long_mul(p3o, sum37, rot2_0); \
dct_long_mul(p4o, sum15, rot2_1); \
dct_wadd(sump13o, p1o, p3o); \
dct_wadd(sump24o, p2o, p4o); \
dct_wadd(sump23o, p2o, p3o); \
dct_wadd(sump14o, p1o, p4o); \
dct_long_mac(x4, sump13o, row7, rot3_0); \
dct_long_mac(x5, sump24o, row5, rot3_1); \
dct_long_mac(x6, sump23o, row3, rot3_2); \
dct_long_mac(x7, sump14o, row1, rot3_3); \
dct_bfly32o(row0, row7, x0, x7, shiftop, shift); \
dct_bfly32o(row1, row6, x1, x6, shiftop, shift); \
dct_bfly32o(row2, row5, x2, x5, shiftop, shift); \
dct_bfly32o(row3, row4, x3, x4, shiftop, shift); \
}
// load
row0 = vld1q_s16(data + 0 * 8);
row1 = vld1q_s16(data + 1 * 8);
row2 = vld1q_s16(data + 2 * 8);
row3 = vld1q_s16(data + 3 * 8);
row4 = vld1q_s16(data + 4 * 8);
row5 = vld1q_s16(data + 5 * 8);
row6 = vld1q_s16(data + 6 * 8);
row7 = vld1q_s16(data + 7 * 8);
// add DC bias
row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
// column pass
dct_pass(vrshrn_n_s32, 10);
// 16bit 8x8 transpose
{
// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
// whether compilers actually get this is another story, sadly.
#define dct_trn16(x, y) \
{ \
int16x8x2_t t = vtrnq_s16(x, y); \
x = t.val[0]; \
y = t.val[1]; \
}
#define dct_trn32(x, y) \
{ \
int32x4x2_t t = \
vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); \
x = vreinterpretq_s16_s32(t.val[0]); \
y = vreinterpretq_s16_s32(t.val[1]); \
}
#define dct_trn64(x, y) \
{ \
int16x8_t x0 = x; \
int16x8_t y0 = y; \
x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); \
y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); \
}
// pass 1
dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
dct_trn16(row2, row3);
dct_trn16(row4, row5);
dct_trn16(row6, row7);
// pass 2
dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
dct_trn32(row1, row3);
dct_trn32(row4, row6);
dct_trn32(row5, row7);
// pass 3
dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
dct_trn64(row1, row5);
dct_trn64(row2, row6);
dct_trn64(row3, row7);
#undef dct_trn16
#undef dct_trn32
#undef dct_trn64
}
// row pass
// vrshrn_n_s32 only supports shifts up to 16, we need
// 17. so do a non-rounding shift of 16 first then follow
// up with a rounding shift by 1.
dct_pass(vshrn_n_s32, 16);
{
// pack and round
uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
// again, these can translate into one instruction, but often don't.
#define dct_trn8_8(x, y) \
{ \
uint8x8x2_t t = vtrn_u8(x, y); \
x = t.val[0]; \
y = t.val[1]; \
}
#define dct_trn8_16(x, y) \
{ \
uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); \
x = vreinterpret_u8_u16(t.val[0]); \
y = vreinterpret_u8_u16(t.val[1]); \
}
#define dct_trn8_32(x, y) \
{ \
uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); \
x = vreinterpret_u8_u32(t.val[0]); \
y = vreinterpret_u8_u32(t.val[1]); \
}
// sadly can't use interleaved stores here since we only write
// 8 bytes to each scan line!
// 8x8 8-bit transpose pass 1
dct_trn8_8(p0, p1);
dct_trn8_8(p2, p3);
dct_trn8_8(p4, p5);
dct_trn8_8(p6, p7);
// pass 2
dct_trn8_16(p0, p2);
dct_trn8_16(p1, p3);
dct_trn8_16(p4, p6);
dct_trn8_16(p5, p7);
// pass 3
dct_trn8_32(p0, p4);
dct_trn8_32(p1, p5);
dct_trn8_32(p2, p6);
dct_trn8_32(p3, p7);
// store
vst1_u8(out, p0);
out += out_stride;
vst1_u8(out, p1);
out += out_stride;
vst1_u8(out, p2);
out += out_stride;
vst1_u8(out, p3);
out += out_stride;
vst1_u8(out, p4);
out += out_stride;
vst1_u8(out, p5);
out += out_stride;
vst1_u8(out, p6);
out += out_stride;
vst1_u8(out, p7);
#undef dct_trn8_8
#undef dct_trn8_16
#undef dct_trn8_32
}
#undef dct_long_mul
#undef dct_long_mac
#undef dct_widen
#undef dct_wadd
#undef dct_wsub
#undef dct_bfly32o
#undef dct_pass
}
#endif // STBI_NEON
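
The comment before the NEON row pass explains that vrshrn_n_s32 cannot shift by 17, so that pass shifts by 16 without rounding and lets the later vqrshrun_n_s16(..., 1) supply the rounding. In scalar terms, ((x >> 16) + 1) >> 1 equals the single rounding shift (x + 65536) >> 17; a quick check of that identity in plain C (a sketch, no NEON required, assuming the usual arithmetic right shift for signed values):

#include <assert.h>
#include <stdint.h>

int main(void) {
  for (int64_t x = -(1 << 20); x <= (1 << 20); x += 7) {
    int64_t two_step = ((x >> 16) + 1) >> 1;   /* shift by 16, then rounding shift by 1 */
    int64_t one_step = (x + (1 << 16)) >> 17;  /* one rounding shift by 17 */
    assert(two_step == one_step);
  }
  return 0;
}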
#define STBI__MARKER_none 0xff
// if there's a pending marker from the entropy stream, return that
// otherwise, fetch from the stream and get a marker. if there's no
@@ -1275,7 +1839,7 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg *z) {
z->huff_ac + ha, z->fast_ac[ha], n,
z->dequant[z->img_comp[n].tq]))
return 0;
stbi__idct_simd$sse(
idct_block_kernel(
z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8,
z->img_comp[n].w2, data);
// every data block is an MCU, so countdown the restart interval
@@ -1309,7 +1873,7 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg *z) {
z->huff_ac + ha, z->fast_ac[ha], n,
z->dequant[z->img_comp[n].tq]))
return 0;
stbi__idct_simd$sse(
idct_block_kernel(
z->img_comp[n].data + z->img_comp[n].w2 * y2 + x2,
z->img_comp[n].w2, data);
}
@@ -1411,7 +1975,7 @@ static void stbi__jpeg_finish(stbi__jpeg *z) {
short *data =
z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
stbi__idct_simd$sse(
idct_block_kernel(
z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8,
z->img_comp[n].w2, data);
}
@@ -1905,6 +2469,203 @@ static unsigned char *stbi__resample_row_nearest(unsigned char *out,
return out;
}
// this is a reduced-precision calculation of YCbCr-to-RGB introduced
// to make sure the code produces the same results in both SIMD and scalar
#define stbi__float2fixed(x) (((int)((x)*4096.0f + 0.5f)) << 8)
static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y,
const stbi_uc *pcb, const stbi_uc *pcr,
int count, int step) {
int i;
for (i = 0; i < count; ++i) {
int y_fixed = (y[i] << 20) + (1 << 19); // rounding
int r, g, b;
int cr = pcr[i] - 128;
int cb = pcb[i] - 128;
r = y_fixed + cr * stbi__float2fixed(1.40200f);
g = y_fixed + (cr * -stbi__float2fixed(0.71414f)) +
((cb * -stbi__float2fixed(0.34414f)) & 0xffff0000);
b = y_fixed + cb * stbi__float2fixed(1.77200f);
r >>= 20;
g >>= 20;
b >>= 20;
if ((unsigned)r > 255) {
if (r < 0)
r = 0;
else
r = 255;
}
if ((unsigned)g > 255) {
if (g < 0)
g = 0;
else
g = 255;
}
if ((unsigned)b > 255) {
if (b < 0)
b = 0;
else
b = 255;
}
out[0] = (stbi_uc)r;
out[1] = (stbi_uc)g;
out[2] = (stbi_uc)b;
out[3] = 255;
out += step;
}
}
#if defined(STBI_SSE2) || defined(STBI_NEON)
static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y,
stbi_uc const *pcb, stbi_uc const *pcr,
int count, int step) {
int i = 0;
#ifdef STBI_SSE2
// step == 3 is pretty ugly on the final interleave, and i'm not convinced
// it's useful in practice (you wouldn't use it for textures, for example).
// so just accelerate step == 4 case.
if (step == 4) {
// this is a fairly straightforward implementation and not super-optimized.
__m128i signflip = _mm_set1_epi8(-0x80);
__m128i cr_const0 = _mm_set1_epi16((short)(1.40200f * 4096.0f + 0.5f));
__m128i cr_const1 = _mm_set1_epi16(-(short)(0.71414f * 4096.0f + 0.5f));
__m128i cb_const0 = _mm_set1_epi16(-(short)(0.34414f * 4096.0f + 0.5f));
__m128i cb_const1 = _mm_set1_epi16((short)(1.77200f * 4096.0f + 0.5f));
__m128i y_bias = _mm_set1_epi8((char)(unsigned char)128);
__m128i xw = _mm_set1_epi16(255); // alpha channel
for (; i + 7 < count; i += 8) {
// load
__m128i y_bytes = _mm_loadl_epi64((__m128i *)(y + i));
__m128i cr_bytes = _mm_loadl_epi64((__m128i *)(pcr + i));
__m128i cb_bytes = _mm_loadl_epi64((__m128i *)(pcb + i));
__m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
__m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
// unpack to short (and left-shift cr, cb by 8)
__m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes);
__m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
__m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
// color transform
__m128i yws = _mm_srli_epi16(yw, 4);
__m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
__m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
__m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
__m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
__m128i rws = _mm_add_epi16(cr0, yws);
__m128i gwt = _mm_add_epi16(cb0, yws);
__m128i bws = _mm_add_epi16(yws, cb1);
__m128i gws = _mm_add_epi16(gwt, cr1);
// descale
__m128i rw = _mm_srai_epi16(rws, 4);
__m128i bw = _mm_srai_epi16(bws, 4);
__m128i gw = _mm_srai_epi16(gws, 4);
// back to byte, set up for transpose
__m128i brb = _mm_packus_epi16(rw, bw);
__m128i gxb = _mm_packus_epi16(gw, xw);
// transpose to interleave channels
__m128i t0 = _mm_unpacklo_epi8(brb, gxb);
__m128i t1 = _mm_unpackhi_epi8(brb, gxb);
__m128i o0 = _mm_unpacklo_epi16(t0, t1);
__m128i o1 = _mm_unpackhi_epi16(t0, t1);
// store
_mm_storeu_si128((__m128i *)(out + 0), o0);
_mm_storeu_si128((__m128i *)(out + 16), o1);
out += 32;
}
}
#endif
#ifdef STBI_NEON
// in this version, step=3 support would be easy to add. but is there demand?
if (step == 4) {
// this is a fairly straightforward implementation and not super-optimized.
uint8x8_t signflip = vdup_n_u8(0x80);
int16x8_t cr_const0 = vdupq_n_s16((short)(1.40200f * 4096.0f + 0.5f));
int16x8_t cr_const1 = vdupq_n_s16(-(short)(0.71414f * 4096.0f + 0.5f));
int16x8_t cb_const0 = vdupq_n_s16(-(short)(0.34414f * 4096.0f + 0.5f));
int16x8_t cb_const1 = vdupq_n_s16((short)(1.77200f * 4096.0f + 0.5f));
for (; i + 7 < count; i += 8) {
// load
uint8x8_t y_bytes = vld1_u8(y + i);
uint8x8_t cr_bytes = vld1_u8(pcr + i);
uint8x8_t cb_bytes = vld1_u8(pcb + i);
int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
// expand to s16
int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
int16x8_t crw = vshll_n_s8(cr_biased, 7);
int16x8_t cbw = vshll_n_s8(cb_biased, 7);
// color transform
int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
int16x8_t rws = vaddq_s16(yws, cr0);
int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
int16x8_t bws = vaddq_s16(yws, cb1);
// undo scaling, round, convert to byte
uint8x8x4_t o;
o.val[0] = vqrshrun_n_s16(rws, 4);
o.val[1] = vqrshrun_n_s16(gws, 4);
o.val[2] = vqrshrun_n_s16(bws, 4);
o.val[3] = vdup_n_u8(255);
// store, interleaving r/g/b/a
vst4_u8(out, o);
out += 8 * 4;
}
}
#endif
for (; i < count; ++i) {
int y_fixed = (y[i] << 20) + (1 << 19); // rounding
int r, g, b;
int cr = pcr[i] - 128;
int cb = pcb[i] - 128;
r = y_fixed + cr * stbi__float2fixed(1.40200f);
g = y_fixed + cr * -stbi__float2fixed(0.71414f) +
((cb * -stbi__float2fixed(0.34414f)) & 0xffff0000);
b = y_fixed + cb * stbi__float2fixed(1.77200f);
r >>= 20;
g >>= 20;
b >>= 20;
if ((unsigned)r > 255) {
if (r < 0)
r = 0;
else
r = 255;
}
if ((unsigned)g > 255) {
if (g < 0)
g = 0;
else
g = 255;
}
if ((unsigned)b > 255) {
if (b < 0)
b = 0;
else
b = 255;
}
out[0] = (stbi_uc)r;
out[1] = (stbi_uc)g;
out[2] = (stbi_uc)b;
out[3] = 255;
out += step;
}
}
#endif
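
Both the scalar and SIMD YCbCr paths above use the same 4.12 fixed-point coefficients, which evaluate to the 5743/62611/64126/7258 values that appear literally in the constant tables of the deleted stbi__YCbCr_to_RGB_row$sse2 assembly. A quick check (a sketch, not part of the diff):

#include <assert.h>

int main(void) {
  assert((short)(1.40200f * 4096.0f + 0.5f) == 5743);    /* Cr -> R */
  assert(-(short)(0.71414f * 4096.0f + 0.5f) == -2925);  /* Cr -> G, 62611 as uint16 */
  assert(-(short)(0.34414f * 4096.0f + 0.5f) == -1410);  /* Cb -> G, 64126 as uint16 */
  assert((short)(1.77200f * 4096.0f + 0.5f) == 7258);    /* Cb -> B */
  return 0;
}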
// set up the kernels
static void stbi__setup_jpeg(stbi__jpeg *j) {
j->resample_row_hv_2_kernel = stbi__resample_row_hv_2;

View file

@@ -1,93 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
.balign 16
stbi__YCbCr_to_RGB_row$sse2:
.leafprologue
.profilable
xor %eax,%eax
cmp $8,%r8d
jl 1f
xor %eax,%eax
movdqa 2f(%rip),%xmm2
movdqa 3f(%rip),%xmm8
movdqa 4f(%rip),%xmm9
movdqa 5f(%rip),%xmm10
movdqa 6f(%rip),%xmm4
movdqa 7f(%rip),%xmm5
.balign 16
0: movq (%rsi,%rax),%xmm6
movq (%rcx,%rax),%xmm7
movq (%rdx,%rax),%xmm1
movdqa %xmm2,%xmm0
punpcklbw %xmm6,%xmm0
pxor %xmm2,%xmm7
pxor %xmm6,%xmm6
punpcklbw %xmm7,%xmm6
pxor %xmm2,%xmm1
pxor %xmm3,%xmm3
punpcklbw %xmm1,%xmm3
psrlw $4,%xmm0
movdqa %xmm6,%xmm7
pmulhw %xmm8,%xmm7
movdqa %xmm3,%xmm1
pmulhw %xmm9,%xmm1
pmulhw %xmm10,%xmm3
pmulhw %xmm4,%xmm6
paddw %xmm1,%xmm6
paddw %xmm0,%xmm7
paddw %xmm0,%xmm3
paddw %xmm0,%xmm6
psraw $4,%xmm7
psraw $4,%xmm3
packuswb %xmm3,%xmm7
psraw $4,%xmm6
packuswb %xmm5,%xmm6
movdqa %xmm7,%xmm0
punpcklbw %xmm6,%xmm0
punpckhbw %xmm6,%xmm7
movdqa %xmm0,%xmm1
punpcklwd %xmm7,%xmm1
punpckhwd %xmm7,%xmm0
movdqu %xmm1,(%rdi,%rax,4)
movdqu %xmm0,16(%rdi,%rax,4)
add $8,%rax
lea 7(%rax),%r9d
cmp %r8d,%r9d
jl 0b
1: .leafepilogue
.endfn stbi__YCbCr_to_RGB_row$sse2,globl
.rodata.cst16
2: .byte 128,128,128,128,128,128,128,128
.zero 8
3: .short 5743,5743,5743,5743,5743,5743,5743,5743
4: .short 64126,64126,64126,64126,64126,64126,64126,64126
5: .short 7258,7258,7258,7258,7258,7258,7258,7258
6: .short 62611,62611,62611,62611,62611,62611,62611,62611
7: .short 255,255,255,255,255,255,255,255
.end
// These should be better but need to get them to work
3: .short 11485,11485,11485,11485,11485,11485,11485,11485 # JR m=13 99.964387%
4: .short -11277,-11277,-11277,-11277,-11277,-11277,-11277,-11277 # JG m=15 99.935941%
5: .short 14516,14516,14516,14516,14516,14516,14516,14516 # JB m=13 99.947219%
6: .short -23401,-23401,-23401,-23401,-23401,-23401,-23401,-23401 # JG m=15 99.935941%
7: .short 255,255,255,255,255,255,255,255

View file

@@ -1,56 +0,0 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/limits.h"
#include "libc/log/check.h"
#include "libc/log/log.h"
#include "libc/macros.internal.h"
#include "libc/str/str.h"
#include "third_party/stb/internal.h"
/* this is a reduced-precision calculation of YCbCr-to-RGB introduced
to make sure the code produces the same results in both SIMD and scalar */
#define FLOAT2FIXED(x) (((int)((x)*4096.0f + .5f)) << 8)
void stbi__YCbCr_to_RGB_row(unsigned char *out, const unsigned char *y,
const unsigned char *pcb, const unsigned char *pcr,
unsigned count, unsigned step) {
unsigned i;
unsigned char b4[4];
int y_fixed, r, g, b, cr, cb;
CHECK(step == 3 || step == 4);
CHECK_LE(count, INT_MAX / 4 - 4);
for (i = step == 4 ? stbi__YCbCr_to_RGB_row$sse2(out, y, pcb, pcr, count) : 0;
i < count; ++i) {
y_fixed = (y[i] << 20) + (1 << 19); /* rounding */
cr = pcr[i] - 128;
cb = pcb[i] - 128;
r = y_fixed + cr * FLOAT2FIXED(1.40200f);
g = y_fixed + (cr * -FLOAT2FIXED(0.71414f)) +
((cb * -FLOAT2FIXED(0.34414f)) & 0xffff0000);
b = y_fixed + cb * FLOAT2FIXED(1.77200f);
r >>= 20;
g >>= 20;
b >>= 20;
b4[0] = MIN(255, MAX(0, r));
b4[1] = MIN(255, MAX(0, g));
b4[2] = MIN(255, MAX(0, b));
b4[3] = 255;
memcpy(out + i * step, b4, 4);
}
}