Mirror of https://github.com/jart/cosmopolitan.git (synced 2025-07-12 14:09:12 +00:00)
Make considerably more progress on AARCH64
- Utilities like pledge.com now build
- kprintf() will no longer balk at 48-bit addresses
- There's a new aarch64-dbg build mode that should work
- gc() and defer() are mostly pacified; avoid using them on aarch64
- THIRD_PARTY_STB now has Arm Neon intrinsics for fast image handling
This commit is contained in:
parent 1bfb3aab1b
commit fd34ef732d
91 changed files with 1288 additions and 1192 deletions
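
The bulk of the diff is the image-decoding change: third_party/stb/stb_image.c now picks its JPEG IDCT kernel at compile time instead of always jumping into the hand-written SSE assembly, which is why idct-sse.S, ycbcr-sse2.S, ycbcr.c, and internal.h can be deleted. A condensed sketch of that dispatch, lifted from the stb_image.c hunks further down (the preprocessor lines appear verbatim in the hunk; the comments are annotations added here):

#ifdef __x86_64__
#define STBI_SSE2                          /* SSE2 intrinsics path */
#define idct_block_kernel stbi__idct_simd
#elif defined(__aarch64__)
#define STBI_NEON                          /* Arm Neon intrinsics path */
#define idct_block_kernel stbi__idct_simd
#else
#define idct_block_kernel stbi__idct_block /* portable integer IDCT */
#endif

The JPEG decoder then calls idct_block_kernel(...) at its three call sites instead of the hard-coded stbi__idct_simd$sse, so the same source builds on x86-64, aarch64, and anything else.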
2	third_party/bzip2/bz_internal_error.c	vendored

@@ -18,5 +18,5 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 
 void bz_internal_error(int err) {
-  asm("hlt");
+  __builtin_trap();
 }
2	third_party/bzip2/crctable.c	vendored

@@ -34,7 +34,7 @@ static textstartup void BZ2_crc32Table_init() {
     BZ2_crc32Table[i] = u;
   }
   if (BZ2_crc32Table[0] || BZ2_crc32Table[255] != 0xb1f740b4) {
-    asm("hlt");
+    __builtin_trap();
   }
 }
 
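Both bzip2 hunks above make the same portability fix: asm("hlt") is x86-specific inline assembly, while __builtin_trap() is a GCC/Clang builtin that lowers to whatever trap instruction the target provides (ud2 on x86-64, brk on aarch64), so these stubs now compile for either architecture. A minimal sketch of the resulting error stub (the unused-argument cast is an annotation added here, not part of the diff):

void bz_internal_error(int err) {
  (void)err;         /* the error code is not reported by this stub */
  __builtin_trap();  /* portable hard stop on any architecture */
}
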
4	third_party/chibicc/chibicc.mk	vendored

@@ -10,6 +10,8 @@
 # This makefile compiles and runs each test twice. The first with
 # GCC-built chibicc, and a second time with chibicc-built chibicc
 
+ifeq ($(ARCH), x86_64)
+
 CHIBICC = o/$(MODE)/third_party/chibicc/chibicc.com
 CHIBICC_FLAGS = \
 	-fno-common \
@@ -111,6 +113,8 @@ THIRD_PARTY_CHIBICC_CHECKS = $(foreach x,$(THIRD_PARTY_CHIBICC_ARTIFACTS),$($(x)
 THIRD_PARTY_CHIBICC_OBJS = $(foreach x,$(THIRD_PARTY_CHIBICC_ARTIFACTS),$($(x)_OBJS))
 $(THIRD_PARTY_CHIBICC_OBJS): $(BUILD_FILES) third_party/chibicc/chibicc.mk
 
+endif
+
 .PHONY: o/$(MODE)/third_party/chibicc
 o/$(MODE)/third_party/chibicc: \
 	o/$(MODE)/third_party/chibicc/test \
4	third_party/chibicc/test/test.mk	vendored

@@ -10,6 +10,8 @@
 # This makefile compiles and runs each test twice. The first with
 # GCC-built chibicc, and a second time with chibicc-built chibicc
 
+ifeq ($(ARCH), x86_64)
+
 PKGS += THIRD_PARTY_CHIBICC_TEST
 
 THIRD_PARTY_CHIBICC_TEST_A = o/$(MODE)/third_party/chibicc/test/test.a
@@ -75,6 +77,8 @@ o/$(MODE)/third_party/chibicc/test/%.o: \
 
 o/$(MODE)/third_party/chibicc/test/int128_test.o: private QUOTA = -M1024m
 
+endif
+
 .PHONY: o/$(MODE)/third_party/chibicc/test
 o/$(MODE)/third_party/chibicc/test: \
 	$(THIRD_PARTY_CHIBICC_TEST_BINS) \
48	third_party/compiler_rt/lshrti3.c	vendored

@@ -1,48 +0,0 @@
-/* clang-format off */
-/* ===-- lshrti3.c - Implement __lshrti3 -----------------------------------===
- *
- * The LLVM Compiler Infrastructure
- *
- * This file is dual licensed under the MIT and the University of Illinois Open
- * Source Licenses. See LICENSE.TXT for details.
- *
- * ===----------------------------------------------------------------------===
- *
- * This file implements __lshrti3 for the compiler_rt library.
- *
- * ===----------------------------------------------------------------------===
- */
-
-STATIC_YOINK("huge_compiler_rt_license");
-
-#include "third_party/compiler_rt/int_lib.h"
-
-#ifdef CRT_HAS_128BIT
-
-/* Returns: logical a >> b */
-
-/* Precondition: 0 <= b < bits_in_tword */
-
-COMPILER_RT_ABI ti_int
-__lshrti3(ti_int a, si_int b)
-{
-    const int bits_in_dword = (int)(sizeof(di_int) * CHAR_BIT);
-    utwords input;
-    utwords result;
-    input.all = a;
-    if (b & bits_in_dword) /* bits_in_dword <= b < bits_in_tword */
-    {
-        result.s.high = 0;
-        result.s.low = input.s.high >> (b - bits_in_dword);
-    }
-    else /* 0 <= b < bits_in_dword */
-    {
-        if (b == 0)
-            return a;
-        result.s.high = input.s.high >> b;
-        result.s.low = (input.s.high << (bits_in_dword - b)) | (input.s.low >> b);
-    }
-    return result.all;
-}
-
-#endif /* CRT_HAS_128BIT */
1	third_party/gcc/libexec/gcc/aarch64-linux-musl/9.2.0/collect-ld.sym	vendored	Normal file

@@ -0,0 +1 @@
+../../../../x86_64-linux-musl/bin/ld.bfd
24	third_party/sed/process.c	vendored

@@ -68,7 +68,7 @@ static __inline int regexec_e(regex_t *, const char *, int, int, size_t);
 static void regsub(SPACE *, char *, char *);
 static int substitute(struct s_command *);
 
-struct s_appends *appends;  /* Array of pointers to strings to append. */
+struct s_appends *appends_; /* Array of pointers to strings to append. */
 static size_t appendx;      /* Index into appends array. */
 size_t appendnum;           /* Size of appends array. */
 
@@ -111,12 +111,12 @@ redirect:
 			goto redirect;
 		case 'a':
 			if (appendx >= appendnum)
-				appends = xrealloc(appends,
+				appends_ = xrealloc(appends_,
 					sizeof(struct s_appends) *
 					(appendnum *= 2));
-			appends[appendx].type = AP_STRING;
-			appends[appendx].s = cp->t;
-			appends[appendx].len = strlen(cp->t);
+			appends_[appendx].type = AP_STRING;
+			appends_[appendx].s = cp->t;
+			appends_[appendx].len = strlen(cp->t);
 			appendx++;
 			break;
 		case 'b':
@@ -204,12 +204,12 @@ redirect:
 			exit(0);
 		case 'r':
 			if (appendx >= appendnum)
-				appends = xrealloc(appends,
+				appends_ = xrealloc(appends_,
 					sizeof(struct s_appends) *
 					(appendnum *= 2));
-			appends[appendx].type = AP_FILE;
-			appends[appendx].s = cp->t;
-			appends[appendx].len = strlen(cp->t);
+			appends_[appendx].type = AP_FILE;
+			appends_[appendx].s = cp->t;
+			appends_[appendx].len = strlen(cp->t);
 			appendx++;
 			break;
 		case 's':
@@ -541,9 +541,9 @@ flush_appends(void)
 	char *buf = gc(malloc(8 * 1024));
 
 	for (i = 0; i < appendx; i++)
-		switch (appends[i].type) {
+		switch (appends_[i].type) {
 		case AP_STRING:
-			fwrite(appends[i].s, sizeof(char), appends[i].len,
+			fwrite(appends_[i].s, sizeof(char), appends_[i].len,
 				outfile);
 			break;
 		case AP_FILE:
@@ -555,7 +555,7 @@ flush_appends(void)
 			 * would be truly bizarre, but possible. It's probably
 			 * not that big a performance win, anyhow.
 			 */
-			if ((f = fopen(appends[i].s, "r")) == NULL)
+			if ((f = fopen(appends_[i].s, "r")) == NULL)
 				break;
 			while ((count = fread(buf, sizeof(char), sizeof(buf), f)))
 				(void)fwrite(buf, sizeof(char), count, outfile);
4	third_party/sqlite3/sqlite3.mk	vendored

@@ -58,6 +58,7 @@ THIRD_PARTY_SQLITE3_A_DIRECTDEPS = \
 	LIBC_TIME \
 	LIBC_TINYMATH \
 	LIBC_ZIPOS \
+	THIRD_PARTY_COMPILER_RT \
 	THIRD_PARTY_GDTOA \
 	THIRD_PARTY_LINENOISE \
 	THIRD_PARTY_MUSL \
@@ -79,7 +80,8 @@ o/$(MODE)/third_party/sqlite3/sqlite3.com.dbg: \
 o/$(MODE)/third_party/sqlite3/sqlite3.com: \
 		o/$(MODE)/third_party/sqlite3/sqlite3.com.dbg \
 		o/$(MODE)/third_party/zip/zip.com \
-		o/$(MODE)/tool/build/symtab.com
+		o/$(MODE)/tool/build/symtab.com \
+		$(VM)
 	@$(MAKE_OBJCOPY)
 	@$(MAKE_SYMTAB_CREATE)
 	@$(MAKE_SYMTAB_ZIP)
426	third_party/stb/idct-sse.S	vendored

@@ -1,426 +0,0 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/macros.internal.h"
|
||||
|
||||
// Computes inverse discrete cosine transform.
|
||||
//
|
||||
// @note used to decode jpeg
|
||||
.p2align 4
|
||||
stbi__idct_simd$sse:
|
||||
push %rbp
|
||||
mov %rsp,%rbp
|
||||
movslq %esi,%rsi
|
||||
lea (%rdi,%rsi),%rax
|
||||
sub $96,%rsp
|
||||
movdqa 32(%rdx),%xmm0
|
||||
movdqa 112(%rdx),%xmm9
|
||||
movdqa 48(%rdx),%xmm1
|
||||
movdqa 80(%rdx),%xmm7
|
||||
movdqa %xmm0,%xmm2
|
||||
punpcklwd 96(%rdx),%xmm2
|
||||
punpckhwd 96(%rdx),%xmm0
|
||||
movdqa %xmm9,%xmm8
|
||||
movdqa 16(%rdx),%xmm5
|
||||
movdqa %xmm2,%xmm3
|
||||
movdqa %xmm2,%xmm6
|
||||
movdqa %xmm0,%xmm2
|
||||
pmaddwd .LC1(%rip),%xmm3
|
||||
movdqa %xmm0,%xmm4
|
||||
pmaddwd .LC1(%rip),%xmm2
|
||||
pmaddwd .LC0(%rip),%xmm4
|
||||
punpckhwd %xmm1,%xmm8
|
||||
pmaddwd .LC0(%rip),%xmm6
|
||||
movaps %xmm3,-48(%rbp)
|
||||
movdqa (%rdx),%xmm3
|
||||
movaps %xmm2,-64(%rbp)
|
||||
movdqa 64(%rdx),%xmm2
|
||||
movdqa %xmm3,%xmm0
|
||||
movaps %xmm4,-32(%rbp)
|
||||
paddw %xmm2,%xmm0
|
||||
psubw %xmm2,%xmm3
|
||||
movaps %xmm6,-16(%rbp)
|
||||
movdqa %xmm0,%xmm4
|
||||
pxor %xmm0,%xmm0
|
||||
movdqa %xmm0,%xmm11
|
||||
movdqa %xmm0,%xmm12
|
||||
movdqa %xmm0,%xmm2
|
||||
punpcklwd %xmm4,%xmm11
|
||||
punpckhwd %xmm3,%xmm12
|
||||
punpcklwd %xmm3,%xmm2
|
||||
movdqa %xmm11,%xmm13
|
||||
movdqa %xmm0,%xmm11
|
||||
movdqa %xmm12,%xmm3
|
||||
punpckhwd %xmm4,%xmm11
|
||||
movdqa %xmm8,%xmm12
|
||||
movdqa %xmm8,%xmm4
|
||||
movdqa %xmm11,%xmm14
|
||||
movdqa %xmm7,%xmm8
|
||||
movdqa %xmm9,%xmm11
|
||||
punpckhwd %xmm5,%xmm8
|
||||
psrad $4,%xmm3
|
||||
punpcklwd %xmm1,%xmm11
|
||||
psrad $4,%xmm13
|
||||
psrad $4,%xmm14
|
||||
movdqa %xmm11,%xmm15
|
||||
movaps %xmm13,-80(%rbp)
|
||||
movdqa %xmm8,%xmm6
|
||||
paddw %xmm7,%xmm1
|
||||
pmaddwd .LC3(%rip),%xmm15
|
||||
movaps %xmm14,-96(%rbp)
|
||||
movdqa %xmm8,%xmm14
|
||||
movdqa %xmm5,%xmm8
|
||||
pmaddwd .LC2(%rip),%xmm11
|
||||
pmaddwd .LC2(%rip),%xmm12
|
||||
paddw %xmm9,%xmm8
|
||||
psrad $4,%xmm2
|
||||
pmaddwd .LC3(%rip),%xmm4
|
||||
pmaddwd .LC5(%rip),%xmm6
|
||||
pmaddwd .LC4(%rip),%xmm14
|
||||
movdqa %xmm4,%xmm10
|
||||
movdqa %xmm7,%xmm4
|
||||
movdqa %xmm8,%xmm7
|
||||
punpcklwd %xmm5,%xmm4
|
||||
punpcklwd %xmm1,%xmm7
|
||||
punpckhwd %xmm1,%xmm8
|
||||
movdqa %xmm4,%xmm13
|
||||
movdqa %xmm7,%xmm9
|
||||
pmaddwd .LC5(%rip),%xmm4
|
||||
pmaddwd .LC6(%rip),%xmm9
|
||||
movdqa %xmm8,%xmm5
|
||||
movdqa %xmm7,%xmm1
|
||||
pmaddwd .LC7(%rip),%xmm8
|
||||
pmaddwd .LC6(%rip),%xmm5
|
||||
movdqa %xmm15,%xmm7
|
||||
paddd %xmm9,%xmm11
|
||||
paddd %xmm9,%xmm4
|
||||
movdqa .LC8(%rip),%xmm9
|
||||
paddd %xmm8,%xmm14
|
||||
paddd %xmm10,%xmm8
|
||||
movdqa -96(%rbp),%xmm10
|
||||
paddd -64(%rbp),%xmm10
|
||||
pmaddwd .LC7(%rip),%xmm1
|
||||
pmaddwd .LC4(%rip),%xmm13
|
||||
paddd %xmm5,%xmm12
|
||||
paddd %xmm5,%xmm6
|
||||
paddd %xmm9,%xmm10
|
||||
movdqa -80(%rbp),%xmm5
|
||||
paddd -48(%rbp),%xmm5
|
||||
paddd %xmm1,%xmm13
|
||||
paddd %xmm1,%xmm7
|
||||
movdqa %xmm10,%xmm1
|
||||
psubd %xmm6,%xmm10
|
||||
paddd %xmm9,%xmm5
|
||||
paddd %xmm6,%xmm1
|
||||
psrad $10,%xmm10
|
||||
movdqa -16(%rbp),%xmm6
|
||||
movdqa %xmm1,%xmm15
|
||||
movdqa %xmm5,%xmm1
|
||||
psubd %xmm4,%xmm5
|
||||
psrad $10,%xmm5
|
||||
paddd %xmm4,%xmm1
|
||||
paddd %xmm2,%xmm6
|
||||
packssdw %xmm10,%xmm5
|
||||
movdqa -32(%rbp),%xmm10
|
||||
paddd %xmm9,%xmm6
|
||||
paddd %xmm9,%xmm2
|
||||
psrad $10,%xmm15
|
||||
psrad $10,%xmm1
|
||||
psubd -16(%rbp),%xmm2
|
||||
paddd %xmm3,%xmm10
|
||||
paddd %xmm9,%xmm3
|
||||
packssdw %xmm15,%xmm1
|
||||
paddd %xmm9,%xmm10
|
||||
psubd -32(%rbp),%xmm3
|
||||
movdqa %xmm10,%xmm4
|
||||
psubd %xmm8,%xmm10
|
||||
paddd %xmm8,%xmm4
|
||||
psrad $10,%xmm10
|
||||
movdqa %xmm4,%xmm15
|
||||
movdqa %xmm6,%xmm4
|
||||
psubd %xmm7,%xmm6
|
||||
psrad $10,%xmm6
|
||||
psrad $10,%xmm15
|
||||
paddd %xmm7,%xmm4
|
||||
movdqa %xmm3,%xmm7
|
||||
psubd %xmm14,%xmm3
|
||||
packssdw %xmm10,%xmm6
|
||||
psrad $10,%xmm3
|
||||
psrad $10,%xmm4
|
||||
paddd %xmm14,%xmm7
|
||||
movdqa %xmm7,%xmm8
|
||||
movdqa %xmm2,%xmm7
|
||||
psubd %xmm13,%xmm2
|
||||
paddd %xmm13,%xmm7
|
||||
psrad $10,%xmm8
|
||||
packssdw %xmm15,%xmm4
|
||||
psrad $10,%xmm7
|
||||
psrad $10,%xmm2
|
||||
packssdw %xmm8,%xmm7
|
||||
movdqa -80(%rbp),%xmm8
|
||||
packssdw %xmm3,%xmm2
|
||||
paddd %xmm9,%xmm8
|
||||
paddd -96(%rbp),%xmm9
|
||||
psubd -48(%rbp),%xmm8
|
||||
psubd -64(%rbp),%xmm9
|
||||
movdqa %xmm8,%xmm3
|
||||
movdqa %xmm9,%xmm10
|
||||
psubd %xmm11,%xmm8
|
||||
paddd %xmm12,%xmm10
|
||||
paddd %xmm11,%xmm3
|
||||
psrad $10,%xmm8
|
||||
psrad $10,%xmm10
|
||||
psrad $10,%xmm3
|
||||
psubd %xmm12,%xmm9
|
||||
psrad $10,%xmm9
|
||||
packssdw %xmm10,%xmm3
|
||||
movdqa %xmm1,%xmm10
|
||||
packssdw %xmm9,%xmm8
|
||||
movdqa %xmm7,%xmm9
|
||||
punpckhwd %xmm6,%xmm7
|
||||
punpcklwd %xmm6,%xmm9
|
||||
punpcklwd %xmm8,%xmm10
|
||||
punpckhwd %xmm8,%xmm1
|
||||
movdqa %xmm3,%xmm6
|
||||
movdqa %xmm4,%xmm8
|
||||
punpckhwd %xmm5,%xmm3
|
||||
punpcklwd %xmm5,%xmm6
|
||||
punpcklwd %xmm2,%xmm8
|
||||
movdqa %xmm3,%xmm5
|
||||
punpckhwd %xmm2,%xmm4
|
||||
movdqa %xmm8,%xmm3
|
||||
movdqa %xmm10,%xmm2
|
||||
punpckhwd %xmm6,%xmm8
|
||||
punpcklwd %xmm6,%xmm3
|
||||
punpcklwd %xmm9,%xmm2
|
||||
movdqa %xmm8,%xmm6
|
||||
movdqa %xmm4,%xmm8
|
||||
punpckhwd %xmm9,%xmm10
|
||||
punpcklwd %xmm5,%xmm8
|
||||
punpckhwd %xmm5,%xmm4
|
||||
movdqa %xmm2,%xmm5
|
||||
punpcklwd %xmm3,%xmm5
|
||||
punpckhwd %xmm3,%xmm2
|
||||
movdqa %xmm1,%xmm15
|
||||
movdqa %xmm10,%xmm3
|
||||
punpckhwd %xmm7,%xmm1
|
||||
punpckhwd %xmm6,%xmm10
|
||||
punpcklwd %xmm6,%xmm3
|
||||
movdqa %xmm1,%xmm6
|
||||
punpckhwd %xmm4,%xmm1
|
||||
punpcklwd %xmm4,%xmm6
|
||||
movdqa %xmm3,%xmm4
|
||||
punpcklwd %xmm7,%xmm15
|
||||
punpcklwd %xmm6,%xmm4
|
||||
punpckhwd %xmm6,%xmm3
|
||||
movdqa %xmm15,%xmm7
|
||||
movdqa %xmm4,%xmm6
|
||||
punpcklwd %xmm8,%xmm7
|
||||
movdqa %xmm3,%xmm11
|
||||
movdqa %xmm4,%xmm12
|
||||
movdqa %xmm3,%xmm4
|
||||
movdqa %xmm5,%xmm3
|
||||
paddw %xmm7,%xmm3
|
||||
movdqa %xmm1,%xmm9
|
||||
punpckhwd %xmm8,%xmm15
|
||||
punpcklwd %xmm10,%xmm9
|
||||
psubw %xmm7,%xmm5
|
||||
movdqa %xmm15,%xmm7
|
||||
movdqa %xmm9,%xmm14
|
||||
punpcklwd %xmm2,%xmm7
|
||||
movdqa %xmm1,%xmm8
|
||||
pmaddwd .LC0(%rip),%xmm6
|
||||
punpckhwd %xmm10,%xmm8
|
||||
paddw %xmm15,%xmm10
|
||||
movaps %xmm6,-16(%rbp)
|
||||
pmaddwd .LC1(%rip),%xmm4
|
||||
movdqa %xmm0,%xmm6
|
||||
pmaddwd .LC0(%rip),%xmm11
|
||||
pmaddwd .LC2(%rip),%xmm14
|
||||
pmaddwd .LC1(%rip),%xmm12
|
||||
pmaddwd .LC3(%rip),%xmm9
|
||||
movaps %xmm4,-64(%rbp)
|
||||
movdqa %xmm3,%xmm4
|
||||
movdqa %xmm0,%xmm3
|
||||
punpckhwd %xmm4,%xmm6
|
||||
punpcklwd %xmm4,%xmm3
|
||||
movdqa %xmm0,%xmm4
|
||||
movaps %xmm11,-32(%rbp)
|
||||
movdqa %xmm6,%xmm13
|
||||
movdqa %xmm15,%xmm6
|
||||
punpcklwd %xmm5,%xmm4
|
||||
movaps %xmm12,-48(%rbp)
|
||||
punpckhwd %xmm2,%xmm6
|
||||
paddw %xmm1,%xmm2
|
||||
punpckhwd %xmm5,%xmm0
|
||||
movdqa %xmm14,%xmm11
|
||||
movdqa %xmm2,%xmm5
|
||||
movdqa %xmm7,%xmm14
|
||||
punpckhwd %xmm10,%xmm2
|
||||
psrad $4,%xmm13
|
||||
punpcklwd %xmm10,%xmm5
|
||||
movaps %xmm13,-80(%rbp)
|
||||
movdqa %xmm8,%xmm12
|
||||
movdqa %xmm5,%xmm10
|
||||
pmaddwd .LC4(%rip),%xmm14
|
||||
pmaddwd .LC6(%rip),%xmm10
|
||||
movdqa %xmm2,%xmm15
|
||||
pmaddwd .LC7(%rip),%xmm5
|
||||
pmaddwd .LC3(%rip),%xmm8
|
||||
pmaddwd .LC5(%rip),%xmm7
|
||||
movdqa %xmm14,%xmm13
|
||||
movdqa %xmm6,%xmm14
|
||||
paddd %xmm5,%xmm13
|
||||
paddd %xmm5,%xmm9
|
||||
pmaddwd .LC5(%rip),%xmm6
|
||||
psrad $4,%xmm3
|
||||
pmaddwd .LC6(%rip),%xmm15
|
||||
paddd %xmm10,%xmm7
|
||||
paddd %xmm10,%xmm11
|
||||
psrad $4,%xmm4
|
||||
pmaddwd .LC2(%rip),%xmm12
|
||||
psrad $4,%xmm0
|
||||
pmaddwd .LC4(%rip),%xmm14
|
||||
pmaddwd .LC7(%rip),%xmm2
|
||||
movdqa -80(%rbp),%xmm5
|
||||
paddd %xmm15,%xmm12
|
||||
paddd -64(%rbp),%xmm5
|
||||
paddd %xmm2,%xmm14
|
||||
paddd %xmm8,%xmm2
|
||||
movdqa -48(%rbp),%xmm8
|
||||
paddd %xmm6,%xmm15
|
||||
movdqa .LC9(%rip),%xmm6
|
||||
paddd %xmm3,%xmm8
|
||||
paddd %xmm6,%xmm8
|
||||
paddd %xmm6,%xmm5
|
||||
movdqa %xmm5,%xmm10
|
||||
movdqa %xmm8,%xmm1
|
||||
psubd %xmm15,%xmm5
|
||||
psubd %xmm7,%xmm8
|
||||
psrad $17,%xmm5
|
||||
paddd %xmm7,%xmm1
|
||||
movdqa -32(%rbp),%xmm7
|
||||
psrad $17,%xmm8
|
||||
paddd %xmm15,%xmm10
|
||||
paddd %xmm6,%xmm3
|
||||
packssdw %xmm5,%xmm8
|
||||
movdqa -16(%rbp),%xmm5
|
||||
paddd %xmm0,%xmm7
|
||||
paddd %xmm6,%xmm0
|
||||
paddd %xmm6,%xmm7
|
||||
psrad $17,%xmm10
|
||||
psubd -32(%rbp),%xmm0
|
||||
paddd %xmm4,%xmm5
|
||||
psrad $17,%xmm1
|
||||
movdqa %xmm7,%xmm15
|
||||
paddd %xmm6,%xmm5
|
||||
packssdw %xmm10,%xmm1
|
||||
psubd %xmm2,%xmm7
|
||||
movdqa %xmm5,%xmm10
|
||||
paddd %xmm6,%xmm4
|
||||
psubd %xmm9,%xmm5
|
||||
psubd -16(%rbp),%xmm4
|
||||
psrad $17,%xmm7
|
||||
paddd %xmm2,%xmm15
|
||||
psrad $17,%xmm5
|
||||
psubd -48(%rbp),%xmm3
|
||||
paddd -80(%rbp),%xmm6
|
||||
packssdw %xmm7,%xmm5
|
||||
movdqa %xmm4,%xmm2
|
||||
movdqa %xmm0,%xmm7
|
||||
psubd -64(%rbp),%xmm6
|
||||
paddd %xmm14,%xmm7
|
||||
psrad $17,%xmm15
|
||||
paddd %xmm13,%xmm2
|
||||
psubd %xmm14,%xmm0
|
||||
psrad $17,%xmm7
|
||||
psubd %xmm13,%xmm4
|
||||
psrad $17,%xmm0
|
||||
paddd %xmm9,%xmm10
|
||||
psrad $17,%xmm2
|
||||
psrad $17,%xmm4
|
||||
packuswb %xmm8,%xmm5
|
||||
packssdw %xmm0,%xmm4
|
||||
packssdw %xmm7,%xmm2
|
||||
movdqa %xmm3,%xmm0
|
||||
movdqa %xmm6,%xmm7
|
||||
psrad $17,%xmm10
|
||||
paddd %xmm11,%xmm0
|
||||
paddd %xmm12,%xmm7
|
||||
psubd %xmm12,%xmm6
|
||||
packssdw %xmm15,%xmm10
|
||||
psubd %xmm11,%xmm3
|
||||
psrad $17,%xmm7
|
||||
packuswb %xmm10,%xmm1
|
||||
psrad $17,%xmm0
|
||||
psrad $17,%xmm6
|
||||
psrad $17,%xmm3
|
||||
packssdw %xmm7,%xmm0
|
||||
packssdw %xmm6,%xmm3
|
||||
packuswb %xmm0,%xmm2
|
||||
movdqa %xmm1,%xmm0
|
||||
packuswb %xmm4,%xmm3
|
||||
movdqa %xmm2,%xmm4
|
||||
punpckhbw %xmm5,%xmm2
|
||||
punpcklbw %xmm3,%xmm0
|
||||
punpcklbw %xmm5,%xmm4
|
||||
punpckhbw %xmm3,%xmm1
|
||||
movdqa %xmm2,%xmm3
|
||||
movdqa %xmm0,%xmm2
|
||||
movdqa %xmm1,%xmm5
|
||||
punpcklbw %xmm4,%xmm2
|
||||
punpckhbw %xmm4,%xmm0
|
||||
punpcklbw %xmm3,%xmm5
|
||||
movdqa %xmm2,%xmm4
|
||||
punpckhbw %xmm5,%xmm2
|
||||
punpckhbw %xmm3,%xmm1
|
||||
punpcklbw %xmm5,%xmm4
|
||||
movdqa %xmm0,%xmm3
|
||||
punpckhbw %xmm1,%xmm0
|
||||
movq %xmm4,(%rdi)
|
||||
pshufd $78,%xmm4,%xmm4
|
||||
punpcklbw %xmm1,%xmm3
|
||||
movq %xmm4,(%rax)
|
||||
add %rsi,%rax
|
||||
movq %xmm2,(%rax)
|
||||
add %rsi,%rax
|
||||
pshufd $78,%xmm2,%xmm2
|
||||
movq %xmm2,(%rax)
|
||||
add %rsi,%rax
|
||||
movq %xmm3,(%rax)
|
||||
add %rsi,%rax
|
||||
pshufd $78,%xmm3,%xmm3
|
||||
movq %xmm3,(%rax)
|
||||
movq %xmm0,(%rax,%rsi)
|
||||
pshufd $78,%xmm0,%xmm0
|
||||
movq %xmm0,(%rax,%rsi,2)
|
||||
leave
|
||||
ret
|
||||
.endfn stbi__idct_simd$sse,globl
|
||||
|
||||
.rodata.cst16
|
||||
.LC0: .value 2217,-5350,2217,-5350,2217,-5350,2217,-5350
|
||||
.LC1: .value 5352,2217,5352,2217,5352,2217,5352,2217
|
||||
.LC2: .value -6811,-8034,-6811,-8034,-6811,-8034,-6811,-8034
|
||||
.LC3: .value -8034,4552,-8034,4552,-8034,4552,-8034,4552
|
||||
.LC4: .value 6813,-1597,6813,-1597,6813,-1597,6813,-1597
|
||||
.LC5: .value -1597,4552,-1597,4552,-1597,4552,-1597,4552
|
||||
.LC6: .value 1131,4816,1131,4816,1131,4816,1131,4816
|
||||
.LC7: .value 4816,-5681,4816,-5681,4816,-5681,4816,-5681
|
||||
.LC8: .long 0x200,0x200,0x200,0x200
|
||||
.LC9: .long 0x1010000,0x1010000,0x1010000,0x1010000
|
19	third_party/stb/internal.h	vendored

@@ -1,19 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_STB_INTERNAL_H_
-#define COSMOPOLITAN_THIRD_PARTY_STB_INTERNAL_H_
-#if !(__ASSEMBLER__ + __LINKER__ + 0)
-COSMOPOLITAN_C_START_
-
-void stbi__YCbCr_to_RGB_row(unsigned char *, const unsigned char *,
-                            const unsigned char *, const unsigned char *,
-                            unsigned, unsigned) _Hide;
-int stbi__YCbCr_to_RGB_row$sse2(unsigned char *, const unsigned char *,
-                                const unsigned char *, const unsigned char *,
-                                unsigned) _Hide;
-void stbi__idct_simd$sse(unsigned char *out, int out_stride,
-                         short data[64]) _Hide;
-void stbi__idct_simd$avx(unsigned char *out, int out_stride,
-                         short data[64]) _Hide;
-
-COSMOPOLITAN_C_END_
-#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
-#endif /* COSMOPOLITAN_THIRD_PARTY_STB_INTERNAL_H_ */
775	third_party/stb/stb_image.c	vendored

@@ -16,6 +16,7 @@
|
|||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "third_party/stb/stb_image.h"
|
||||
#include "libc/assert.h"
|
||||
#include "libc/calls/calls.h"
|
||||
#include "libc/fmt/conv.h"
|
||||
|
@ -31,20 +32,33 @@
|
|||
#include "libc/stdio/stdio.h"
|
||||
#include "libc/str/str.h"
|
||||
#include "libc/x/x.h"
|
||||
#include "third_party/stb/internal.h"
|
||||
#include "third_party/stb/stb_image.h"
|
||||
|
||||
#define ROL(w, k) ((w) << (k) | CheckUnsigned(w) >> (sizeof(w) * 8 - (k)))
|
||||
#include "third_party/aarch64/arm_neon.h"
|
||||
#include "third_party/intel/ammintrin.internal.h"
|
||||
|
||||
asm(".ident\t\"\\n\\n\
|
||||
stb_image (Public Domain)\\n\
|
||||
Credit: Sean Barrett, et al.\\n\
|
||||
http://nothings.org/stb\"");
|
||||
|
||||
#ifdef __x86_64__
|
||||
#define STBI_SSE2
|
||||
#define idct_block_kernel stbi__idct_simd
|
||||
#elif defined(__aarch64__)
|
||||
#define STBI_NEON
|
||||
#define idct_block_kernel stbi__idct_simd
|
||||
#else
|
||||
#define idct_block_kernel stbi__idct_block
|
||||
#endif
|
||||
|
||||
#define ROL(w, k) ((w) << (k) | CheckUnsigned(w) >> (sizeof(w) * 8 - (k)))
|
||||
|
||||
#ifndef STBI_REALLOC_SIZED
|
||||
#define STBI_REALLOC_SIZED(p, oldsz, newsz) realloc(p, newsz)
|
||||
#endif
|
||||
|
||||
typedef unsigned char stbi_uc;
|
||||
typedef unsigned short stbi_us;
|
||||
|
||||
// stbi__context structure is our basic context used by all images, so it
|
||||
// contains all the IO context, plus some basic image information
|
||||
typedef struct {
|
||||
|
@ -1219,6 +1233,556 @@ forceinline unsigned char stbi__clamp(int x) {
|
|||
return (unsigned char)x;
|
||||
}
|
||||
|
||||
#define stbi__f2f(x) ((int)(((x)*4096 + 0.5)))
|
||||
#define stbi__fsh(x) ((x)*4096)
|
||||
|
||||
// derived from jidctint -- DCT_ISLOW
|
||||
#define STBI__IDCT_1D(s0, s1, s2, s3, s4, s5, s6, s7) \
|
||||
int t0, t1, t2, t3, p1, p2, p3, p4, p5, x0, x1, x2, x3; \
|
||||
p2 = s2; \
|
||||
p3 = s6; \
|
||||
p1 = (p2 + p3) * stbi__f2f(0.5411961f); \
|
||||
t2 = p1 + p3 * stbi__f2f(-1.847759065f); \
|
||||
t3 = p1 + p2 * stbi__f2f(0.765366865f); \
|
||||
p2 = s0; \
|
||||
p3 = s4; \
|
||||
t0 = stbi__fsh(p2 + p3); \
|
||||
t1 = stbi__fsh(p2 - p3); \
|
||||
x0 = t0 + t3; \
|
||||
x3 = t0 - t3; \
|
||||
x1 = t1 + t2; \
|
||||
x2 = t1 - t2; \
|
||||
t0 = s7; \
|
||||
t1 = s5; \
|
||||
t2 = s3; \
|
||||
t3 = s1; \
|
||||
p3 = t0 + t2; \
|
||||
p4 = t1 + t3; \
|
||||
p1 = t0 + t3; \
|
||||
p2 = t1 + t2; \
|
||||
p5 = (p3 + p4) * stbi__f2f(1.175875602f); \
|
||||
t0 = t0 * stbi__f2f(0.298631336f); \
|
||||
t1 = t1 * stbi__f2f(2.053119869f); \
|
||||
t2 = t2 * stbi__f2f(3.072711026f); \
|
||||
t3 = t3 * stbi__f2f(1.501321110f); \
|
||||
p1 = p5 + p1 * stbi__f2f(-0.899976223f); \
|
||||
p2 = p5 + p2 * stbi__f2f(-2.562915447f); \
|
||||
p3 = p3 * stbi__f2f(-1.961570560f); \
|
||||
p4 = p4 * stbi__f2f(-0.390180644f); \
|
||||
t3 += p1 + p4; \
|
||||
t2 += p2 + p3; \
|
||||
t1 += p2 + p4; \
|
||||
t0 += p1 + p3;
|
||||
|
||||
static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64]) {
|
||||
int i, val[64], *v = val;
|
||||
stbi_uc *o;
|
||||
short *d = data;
|
||||
|
||||
// columns
|
||||
for (i = 0; i < 8; ++i, ++d, ++v) {
|
||||
// if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
|
||||
if (d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0 && d[40] == 0 &&
|
||||
d[48] == 0 && d[56] == 0) {
|
||||
// no shortcut 0 seconds
|
||||
// (1|2|3|4|5|6|7)==0 0 seconds
|
||||
// all separate -0.047 seconds
|
||||
// 1 && 2|3 && 4|5 && 6|7: -0.047 seconds
|
||||
int dcterm = d[0] * 4;
|
||||
v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
|
||||
} else {
|
||||
STBI__IDCT_1D(d[0], d[8], d[16], d[24], d[32], d[40], d[48], d[56])
|
||||
// constants scaled things up by 1<<12; let's bring them back
|
||||
// down, but keep 2 extra bits of precision
|
||||
x0 += 512;
|
||||
x1 += 512;
|
||||
x2 += 512;
|
||||
x3 += 512;
|
||||
v[0] = (x0 + t3) >> 10;
|
||||
v[56] = (x0 - t3) >> 10;
|
||||
v[8] = (x1 + t2) >> 10;
|
||||
v[48] = (x1 - t2) >> 10;
|
||||
v[16] = (x2 + t1) >> 10;
|
||||
v[40] = (x2 - t1) >> 10;
|
||||
v[24] = (x3 + t0) >> 10;
|
||||
v[32] = (x3 - t0) >> 10;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride) {
|
||||
// no fast case since the first 1D IDCT spread components out
|
||||
STBI__IDCT_1D(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7])
|
||||
// constants scaled things up by 1<<12, plus we had 1<<2 from first
|
||||
// loop, plus horizontal and vertical each scale by sqrt(8) so together
|
||||
// we've got an extra 1<<3, so 1<<17 total we need to remove.
|
||||
// so we want to round that, which means adding 0.5 * 1<<17,
|
||||
// aka 65536. Also, we'll end up with -128 to 127 that we want
|
||||
// to encode as 0..255 by adding 128, so we'll add that before the shift
|
||||
x0 += 65536 + (128 << 17);
|
||||
x1 += 65536 + (128 << 17);
|
||||
x2 += 65536 + (128 << 17);
|
||||
x3 += 65536 + (128 << 17);
|
||||
// tried computing the shifts into temps, or'ing the temps to see
|
||||
// if any were out of range, but that was slower
|
||||
o[0] = stbi__clamp((x0 + t3) >> 17);
|
||||
o[7] = stbi__clamp((x0 - t3) >> 17);
|
||||
o[1] = stbi__clamp((x1 + t2) >> 17);
|
||||
o[6] = stbi__clamp((x1 - t2) >> 17);
|
||||
o[2] = stbi__clamp((x2 + t1) >> 17);
|
||||
o[5] = stbi__clamp((x2 - t1) >> 17);
|
||||
o[3] = stbi__clamp((x3 + t0) >> 17);
|
||||
o[4] = stbi__clamp((x3 - t0) >> 17);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef STBI_SSE2
|
||||
// sse2 integer IDCT. not the fastest possible implementation but it
|
||||
// produces bit-identical results to the generic C version so it's
|
||||
// fully "transparent".
|
||||
static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) {
|
||||
// This is constructed to match our regular (generic) integer IDCT exactly.
|
||||
__m128i row0, row1, row2, row3, row4, row5, row6, row7;
|
||||
__m128i tmp;
|
||||
|
||||
// dot product constant: even elems=x, odd elems=y
|
||||
#define dct_const(x, y) _mm_setr_epi16((x), (y), (x), (y), (x), (y), (x), (y))
|
||||
|
||||
// out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit)
|
||||
// out(1) = c1[even]*x + c1[odd]*y
|
||||
#define dct_rot(out0, out1, x, y, c0, c1) \
|
||||
__m128i c0##lo = _mm_unpacklo_epi16((x), (y)); \
|
||||
__m128i c0##hi = _mm_unpackhi_epi16((x), (y)); \
|
||||
__m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
|
||||
__m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
|
||||
__m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
|
||||
__m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
|
||||
|
||||
// out = in << 12 (in 16-bit, out 32-bit)
|
||||
#define dct_widen(out, in) \
|
||||
__m128i out##_l = \
|
||||
_mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
|
||||
__m128i out##_h = \
|
||||
_mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
|
||||
|
||||
// wide add
|
||||
#define dct_wadd(out, a, b) \
|
||||
__m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
|
||||
__m128i out##_h = _mm_add_epi32(a##_h, b##_h)
|
||||
|
||||
// wide sub
|
||||
#define dct_wsub(out, a, b) \
|
||||
__m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
|
||||
__m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
|
||||
|
||||
// butterfly a/b, add bias, then shift by "s" and pack
|
||||
#define dct_bfly32o(out0, out1, a, b, bias, s) \
|
||||
{ \
|
||||
__m128i abiased_l = _mm_add_epi32(a##_l, bias); \
|
||||
__m128i abiased_h = _mm_add_epi32(a##_h, bias); \
|
||||
dct_wadd(sum, abiased, b); \
|
||||
dct_wsub(dif, abiased, b); \
|
||||
out0 = \
|
||||
_mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
|
||||
out1 = \
|
||||
_mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
|
||||
}
|
||||
|
||||
// 8-bit interleave step (for transposes)
|
||||
#define dct_interleave8(a, b) \
|
||||
tmp = a; \
|
||||
a = _mm_unpacklo_epi8(a, b); \
|
||||
b = _mm_unpackhi_epi8(tmp, b)
|
||||
|
||||
// 16-bit interleave step (for transposes)
|
||||
#define dct_interleave16(a, b) \
|
||||
tmp = a; \
|
||||
a = _mm_unpacklo_epi16(a, b); \
|
||||
b = _mm_unpackhi_epi16(tmp, b)
|
||||
|
||||
#define dct_pass(bias, shift) \
|
||||
{ \
|
||||
/* even part */ \
|
||||
dct_rot(t2e, t3e, row2, row6, rot0_0, rot0_1); \
|
||||
__m128i sum04 = _mm_add_epi16(row0, row4); \
|
||||
__m128i dif04 = _mm_sub_epi16(row0, row4); \
|
||||
dct_widen(t0e, sum04); \
|
||||
dct_widen(t1e, dif04); \
|
||||
dct_wadd(x0, t0e, t3e); \
|
||||
dct_wsub(x3, t0e, t3e); \
|
||||
dct_wadd(x1, t1e, t2e); \
|
||||
dct_wsub(x2, t1e, t2e); \
|
||||
/* odd part */ \
|
||||
dct_rot(y0o, y2o, row7, row3, rot2_0, rot2_1); \
|
||||
dct_rot(y1o, y3o, row5, row1, rot3_0, rot3_1); \
|
||||
__m128i sum17 = _mm_add_epi16(row1, row7); \
|
||||
__m128i sum35 = _mm_add_epi16(row3, row5); \
|
||||
dct_rot(y4o, y5o, sum17, sum35, rot1_0, rot1_1); \
|
||||
dct_wadd(x4, y0o, y4o); \
|
||||
dct_wadd(x5, y1o, y5o); \
|
||||
dct_wadd(x6, y2o, y5o); \
|
||||
dct_wadd(x7, y3o, y4o); \
|
||||
dct_bfly32o(row0, row7, x0, x7, bias, shift); \
|
||||
dct_bfly32o(row1, row6, x1, x6, bias, shift); \
|
||||
dct_bfly32o(row2, row5, x2, x5, bias, shift); \
|
||||
dct_bfly32o(row3, row4, x3, x4, bias, shift); \
|
||||
}
|
||||
|
||||
__m128i rot0_0 = dct_const(stbi__f2f(0.5411961f),
|
||||
stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
|
||||
__m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f(0.765366865f),
|
||||
stbi__f2f(0.5411961f));
|
||||
__m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f),
|
||||
stbi__f2f(1.175875602f));
|
||||
__m128i rot1_1 =
|
||||
dct_const(stbi__f2f(1.175875602f),
|
||||
stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
|
||||
__m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f(0.298631336f),
|
||||
stbi__f2f(-1.961570560f));
|
||||
__m128i rot2_1 =
|
||||
dct_const(stbi__f2f(-1.961570560f),
|
||||
stbi__f2f(-1.961570560f) + stbi__f2f(3.072711026f));
|
||||
__m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f(2.053119869f),
|
||||
stbi__f2f(-0.390180644f));
|
||||
__m128i rot3_1 =
|
||||
dct_const(stbi__f2f(-0.390180644f),
|
||||
stbi__f2f(-0.390180644f) + stbi__f2f(1.501321110f));
|
||||
|
||||
// rounding biases in column/row passes, see stbi__idct_block for explanation.
|
||||
__m128i bias_0 = _mm_set1_epi32(512);
|
||||
__m128i bias_1 = _mm_set1_epi32(65536 + (128 << 17));
|
||||
|
||||
// load
|
||||
row0 = _mm_load_si128((const __m128i *)(data + 0 * 8));
|
||||
row1 = _mm_load_si128((const __m128i *)(data + 1 * 8));
|
||||
row2 = _mm_load_si128((const __m128i *)(data + 2 * 8));
|
||||
row3 = _mm_load_si128((const __m128i *)(data + 3 * 8));
|
||||
row4 = _mm_load_si128((const __m128i *)(data + 4 * 8));
|
||||
row5 = _mm_load_si128((const __m128i *)(data + 5 * 8));
|
||||
row6 = _mm_load_si128((const __m128i *)(data + 6 * 8));
|
||||
row7 = _mm_load_si128((const __m128i *)(data + 7 * 8));
|
||||
|
||||
// column pass
|
||||
dct_pass(bias_0, 10);
|
||||
|
||||
{
|
||||
// 16bit 8x8 transpose pass 1
|
||||
dct_interleave16(row0, row4);
|
||||
dct_interleave16(row1, row5);
|
||||
dct_interleave16(row2, row6);
|
||||
dct_interleave16(row3, row7);
|
||||
|
||||
// transpose pass 2
|
||||
dct_interleave16(row0, row2);
|
||||
dct_interleave16(row1, row3);
|
||||
dct_interleave16(row4, row6);
|
||||
dct_interleave16(row5, row7);
|
||||
|
||||
// transpose pass 3
|
||||
dct_interleave16(row0, row1);
|
||||
dct_interleave16(row2, row3);
|
||||
dct_interleave16(row4, row5);
|
||||
dct_interleave16(row6, row7);
|
||||
}
|
||||
|
||||
// row pass
|
||||
dct_pass(bias_1, 17);
|
||||
|
||||
{
|
||||
// pack
|
||||
__m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
|
||||
__m128i p1 = _mm_packus_epi16(row2, row3);
|
||||
__m128i p2 = _mm_packus_epi16(row4, row5);
|
||||
__m128i p3 = _mm_packus_epi16(row6, row7);
|
||||
|
||||
// 8bit 8x8 transpose pass 1
|
||||
dct_interleave8(p0, p2); // a0e0a1e1...
|
||||
dct_interleave8(p1, p3); // c0g0c1g1...
|
||||
|
||||
// transpose pass 2
|
||||
dct_interleave8(p0, p1); // a0c0e0g0...
|
||||
dct_interleave8(p2, p3); // b0d0f0h0...
|
||||
|
||||
// transpose pass 3
|
||||
dct_interleave8(p0, p2); // a0b0c0d0...
|
||||
dct_interleave8(p1, p3); // a4b4c4d4...
|
||||
|
||||
// store
|
||||
_mm_storel_epi64((__m128i *)out, p0);
|
||||
out += out_stride;
|
||||
_mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p0, 0x4e));
|
||||
out += out_stride;
|
||||
_mm_storel_epi64((__m128i *)out, p2);
|
||||
out += out_stride;
|
||||
_mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p2, 0x4e));
|
||||
out += out_stride;
|
||||
_mm_storel_epi64((__m128i *)out, p1);
|
||||
out += out_stride;
|
||||
_mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p1, 0x4e));
|
||||
out += out_stride;
|
||||
_mm_storel_epi64((__m128i *)out, p3);
|
||||
out += out_stride;
|
||||
_mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p3, 0x4e));
|
||||
}
|
||||
|
||||
#undef dct_const
|
||||
#undef dct_rot
|
||||
#undef dct_widen
|
||||
#undef dct_wadd
|
||||
#undef dct_wsub
|
||||
#undef dct_bfly32o
|
||||
#undef dct_interleave8
|
||||
#undef dct_interleave16
|
||||
#undef dct_pass
|
||||
}
|
||||
|
||||
#endif // STBI_SSE2
|
||||
|
||||
#ifdef STBI_NEON
|
||||
|
||||
// NEON integer IDCT. should produce bit-identical
|
||||
// results to the generic C version.
|
||||
static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) {
|
||||
int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
|
||||
|
||||
int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f));
|
||||
int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
|
||||
int16x4_t rot0_2 = vdup_n_s16(stbi__f2f(0.765366865f));
|
||||
int16x4_t rot1_0 = vdup_n_s16(stbi__f2f(1.175875602f));
|
||||
int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
|
||||
int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
|
||||
int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
|
||||
int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
|
||||
int16x4_t rot3_0 = vdup_n_s16(stbi__f2f(0.298631336f));
|
||||
int16x4_t rot3_1 = vdup_n_s16(stbi__f2f(2.053119869f));
|
||||
int16x4_t rot3_2 = vdup_n_s16(stbi__f2f(3.072711026f));
|
||||
int16x4_t rot3_3 = vdup_n_s16(stbi__f2f(1.501321110f));
|
||||
|
||||
#define dct_long_mul(out, inq, coeff) \
|
||||
int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
|
||||
int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
|
||||
|
||||
#define dct_long_mac(out, acc, inq, coeff) \
|
||||
int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
|
||||
int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
|
||||
|
||||
#define dct_widen(out, inq) \
|
||||
int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
|
||||
int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
|
||||
|
||||
// wide add
|
||||
#define dct_wadd(out, a, b) \
|
||||
int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
|
||||
int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
|
||||
|
||||
// wide sub
|
||||
#define dct_wsub(out, a, b) \
|
||||
int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
|
||||
int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
|
||||
|
||||
// butterfly a/b, then shift using "shiftop" by "s" and pack
|
||||
#define dct_bfly32o(out0, out1, a, b, shiftop, s) \
|
||||
{ \
|
||||
dct_wadd(sum, a, b); \
|
||||
dct_wsub(dif, a, b); \
|
||||
out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
|
||||
out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
|
||||
}
|
||||
|
||||
#define dct_pass(shiftop, shift) \
|
||||
{ \
|
||||
/* even part */ \
|
||||
int16x8_t sum26 = vaddq_s16(row2, row6); \
|
||||
dct_long_mul(p1e, sum26, rot0_0); \
|
||||
dct_long_mac(t2e, p1e, row6, rot0_1); \
|
||||
dct_long_mac(t3e, p1e, row2, rot0_2); \
|
||||
int16x8_t sum04 = vaddq_s16(row0, row4); \
|
||||
int16x8_t dif04 = vsubq_s16(row0, row4); \
|
||||
dct_widen(t0e, sum04); \
|
||||
dct_widen(t1e, dif04); \
|
||||
dct_wadd(x0, t0e, t3e); \
|
||||
dct_wsub(x3, t0e, t3e); \
|
||||
dct_wadd(x1, t1e, t2e); \
|
||||
dct_wsub(x2, t1e, t2e); \
|
||||
/* odd part */ \
|
||||
int16x8_t sum15 = vaddq_s16(row1, row5); \
|
||||
int16x8_t sum17 = vaddq_s16(row1, row7); \
|
||||
int16x8_t sum35 = vaddq_s16(row3, row5); \
|
||||
int16x8_t sum37 = vaddq_s16(row3, row7); \
|
||||
int16x8_t sumodd = vaddq_s16(sum17, sum35); \
|
||||
dct_long_mul(p5o, sumodd, rot1_0); \
|
||||
dct_long_mac(p1o, p5o, sum17, rot1_1); \
|
||||
dct_long_mac(p2o, p5o, sum35, rot1_2); \
|
||||
dct_long_mul(p3o, sum37, rot2_0); \
|
||||
dct_long_mul(p4o, sum15, rot2_1); \
|
||||
dct_wadd(sump13o, p1o, p3o); \
|
||||
dct_wadd(sump24o, p2o, p4o); \
|
||||
dct_wadd(sump23o, p2o, p3o); \
|
||||
dct_wadd(sump14o, p1o, p4o); \
|
||||
dct_long_mac(x4, sump13o, row7, rot3_0); \
|
||||
dct_long_mac(x5, sump24o, row5, rot3_1); \
|
||||
dct_long_mac(x6, sump23o, row3, rot3_2); \
|
||||
dct_long_mac(x7, sump14o, row1, rot3_3); \
|
||||
dct_bfly32o(row0, row7, x0, x7, shiftop, shift); \
|
||||
dct_bfly32o(row1, row6, x1, x6, shiftop, shift); \
|
||||
dct_bfly32o(row2, row5, x2, x5, shiftop, shift); \
|
||||
dct_bfly32o(row3, row4, x3, x4, shiftop, shift); \
|
||||
}
|
||||
|
||||
// load
|
||||
row0 = vld1q_s16(data + 0 * 8);
|
||||
row1 = vld1q_s16(data + 1 * 8);
|
||||
row2 = vld1q_s16(data + 2 * 8);
|
||||
row3 = vld1q_s16(data + 3 * 8);
|
||||
row4 = vld1q_s16(data + 4 * 8);
|
||||
row5 = vld1q_s16(data + 5 * 8);
|
||||
row6 = vld1q_s16(data + 6 * 8);
|
||||
row7 = vld1q_s16(data + 7 * 8);
|
||||
|
||||
// add DC bias
|
||||
row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
|
||||
|
||||
// column pass
|
||||
dct_pass(vrshrn_n_s32, 10);
|
||||
|
||||
// 16bit 8x8 transpose
|
||||
{
|
||||
// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
|
||||
// whether compilers actually get this is another story, sadly.
|
||||
#define dct_trn16(x, y) \
|
||||
{ \
|
||||
int16x8x2_t t = vtrnq_s16(x, y); \
|
||||
x = t.val[0]; \
|
||||
y = t.val[1]; \
|
||||
}
|
||||
#define dct_trn32(x, y) \
|
||||
{ \
|
||||
int32x4x2_t t = \
|
||||
vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); \
|
||||
x = vreinterpretq_s16_s32(t.val[0]); \
|
||||
y = vreinterpretq_s16_s32(t.val[1]); \
|
||||
}
|
||||
#define dct_trn64(x, y) \
|
||||
{ \
|
||||
int16x8_t x0 = x; \
|
||||
int16x8_t y0 = y; \
|
||||
x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); \
|
||||
y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); \
|
||||
}
|
||||
|
||||
// pass 1
|
||||
dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
|
||||
dct_trn16(row2, row3);
|
||||
dct_trn16(row4, row5);
|
||||
dct_trn16(row6, row7);
|
||||
|
||||
// pass 2
|
||||
dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
|
||||
dct_trn32(row1, row3);
|
||||
dct_trn32(row4, row6);
|
||||
dct_trn32(row5, row7);
|
||||
|
||||
// pass 3
|
||||
dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
|
||||
dct_trn64(row1, row5);
|
||||
dct_trn64(row2, row6);
|
||||
dct_trn64(row3, row7);
|
||||
|
||||
#undef dct_trn16
|
||||
#undef dct_trn32
|
||||
#undef dct_trn64
|
||||
}
|
||||
|
||||
// row pass
|
||||
// vrshrn_n_s32 only supports shifts up to 16, we need
|
||||
// 17. so do a non-rounding shift of 16 first then follow
|
||||
// up with a rounding shift by 1.
|
||||
dct_pass(vshrn_n_s32, 16);
|
||||
|
||||
{
|
||||
// pack and round
|
||||
uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
|
||||
uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
|
||||
uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
|
||||
uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
|
||||
uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
|
||||
uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
|
||||
uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
|
||||
uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
|
||||
|
||||
// again, these can translate into one instruction, but often don't.
|
||||
#define dct_trn8_8(x, y) \
|
||||
{ \
|
||||
uint8x8x2_t t = vtrn_u8(x, y); \
|
||||
x = t.val[0]; \
|
||||
y = t.val[1]; \
|
||||
}
|
||||
#define dct_trn8_16(x, y) \
|
||||
{ \
|
||||
uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); \
|
||||
x = vreinterpret_u8_u16(t.val[0]); \
|
||||
y = vreinterpret_u8_u16(t.val[1]); \
|
||||
}
|
||||
#define dct_trn8_32(x, y) \
|
||||
{ \
|
||||
uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); \
|
||||
x = vreinterpret_u8_u32(t.val[0]); \
|
||||
y = vreinterpret_u8_u32(t.val[1]); \
|
||||
}
|
||||
|
||||
// sadly can't use interleaved stores here since we only write
|
||||
// 8 bytes to each scan line!
|
||||
|
||||
// 8x8 8-bit transpose pass 1
|
||||
dct_trn8_8(p0, p1);
|
||||
dct_trn8_8(p2, p3);
|
||||
dct_trn8_8(p4, p5);
|
||||
dct_trn8_8(p6, p7);
|
||||
|
||||
// pass 2
|
||||
dct_trn8_16(p0, p2);
|
||||
dct_trn8_16(p1, p3);
|
||||
dct_trn8_16(p4, p6);
|
||||
dct_trn8_16(p5, p7);
|
||||
|
||||
// pass 3
|
||||
dct_trn8_32(p0, p4);
|
||||
dct_trn8_32(p1, p5);
|
||||
dct_trn8_32(p2, p6);
|
||||
dct_trn8_32(p3, p7);
|
||||
|
||||
// store
|
||||
vst1_u8(out, p0);
|
||||
out += out_stride;
|
||||
vst1_u8(out, p1);
|
||||
out += out_stride;
|
||||
vst1_u8(out, p2);
|
||||
out += out_stride;
|
||||
vst1_u8(out, p3);
|
||||
out += out_stride;
|
||||
vst1_u8(out, p4);
|
||||
out += out_stride;
|
||||
vst1_u8(out, p5);
|
||||
out += out_stride;
|
||||
vst1_u8(out, p6);
|
||||
out += out_stride;
|
||||
vst1_u8(out, p7);
|
||||
|
||||
#undef dct_trn8_8
|
||||
#undef dct_trn8_16
|
||||
#undef dct_trn8_32
|
||||
}
|
||||
|
||||
#undef dct_long_mul
|
||||
#undef dct_long_mac
|
||||
#undef dct_widen
|
||||
#undef dct_wadd
|
||||
#undef dct_wsub
|
||||
#undef dct_bfly32o
|
||||
#undef dct_pass
|
||||
}
|
||||
|
||||
#endif // STBI_NEON
|
||||
|
||||
#define STBI__MARKER_none 0xff
|
||||
// if there's a pending marker from the entropy stream, return that
|
||||
// otherwise, fetch from the stream and get a marker. if there's no
|
||||
|
@ -1275,7 +1839,7 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg *z) {
|
|||
z->huff_ac + ha, z->fast_ac[ha], n,
|
||||
z->dequant[z->img_comp[n].tq]))
|
||||
return 0;
|
||||
stbi__idct_simd$sse(
|
||||
idct_block_kernel(
|
||||
z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8,
|
||||
z->img_comp[n].w2, data);
|
||||
// every data block is an MCU, so countdown the restart interval
|
||||
|
@ -1309,7 +1873,7 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg *z) {
|
|||
z->huff_ac + ha, z->fast_ac[ha], n,
|
||||
z->dequant[z->img_comp[n].tq]))
|
||||
return 0;
|
||||
stbi__idct_simd$sse(
|
||||
idct_block_kernel(
|
||||
z->img_comp[n].data + z->img_comp[n].w2 * y2 + x2,
|
||||
z->img_comp[n].w2, data);
|
||||
}
|
||||
|
@ -1411,7 +1975,7 @@ static void stbi__jpeg_finish(stbi__jpeg *z) {
|
|||
short *data =
|
||||
z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
|
||||
stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
|
||||
stbi__idct_simd$sse(
|
||||
idct_block_kernel(
|
||||
z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8,
|
||||
z->img_comp[n].w2, data);
|
||||
}
|
||||
|
@ -1905,6 +2469,203 @@ static unsigned char *stbi__resample_row_nearest(unsigned char *out,
|
|||
return out;
|
||||
}
|
||||
|
||||
// this is a reduced-precision calculation of YCbCr-to-RGB introduced
|
||||
// to make sure the code produces the same results in both SIMD and scalar
|
||||
#define stbi__float2fixed(x) (((int)((x)*4096.0f + 0.5f)) << 8)
|
||||
static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y,
|
||||
const stbi_uc *pcb, const stbi_uc *pcr,
|
||||
int count, int step) {
|
||||
int i;
|
||||
for (i = 0; i < count; ++i) {
|
||||
int y_fixed = (y[i] << 20) + (1 << 19); // rounding
|
||||
int r, g, b;
|
||||
int cr = pcr[i] - 128;
|
||||
int cb = pcb[i] - 128;
|
||||
r = y_fixed + cr * stbi__float2fixed(1.40200f);
|
||||
g = y_fixed + (cr * -stbi__float2fixed(0.71414f)) +
|
||||
((cb * -stbi__float2fixed(0.34414f)) & 0xffff0000);
|
||||
b = y_fixed + cb * stbi__float2fixed(1.77200f);
|
||||
r >>= 20;
|
||||
g >>= 20;
|
||||
b >>= 20;
|
||||
if ((unsigned)r > 255) {
|
||||
if (r < 0)
|
||||
r = 0;
|
||||
else
|
||||
r = 255;
|
||||
}
|
||||
if ((unsigned)g > 255) {
|
||||
if (g < 0)
|
||||
g = 0;
|
||||
else
|
||||
g = 255;
|
||||
}
|
||||
if ((unsigned)b > 255) {
|
||||
if (b < 0)
|
||||
b = 0;
|
||||
else
|
||||
b = 255;
|
||||
}
|
||||
out[0] = (stbi_uc)r;
|
||||
out[1] = (stbi_uc)g;
|
||||
out[2] = (stbi_uc)b;
|
||||
out[3] = 255;
|
||||
out += step;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(STBI_SSE2) || defined(STBI_NEON)
|
||||
static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y,
|
||||
stbi_uc const *pcb, stbi_uc const *pcr,
|
||||
int count, int step) {
|
||||
int i = 0;
|
||||
|
||||
#ifdef STBI_SSE2
|
||||
// step == 3 is pretty ugly on the final interleave, and i'm not convinced
|
||||
// it's useful in practice (you wouldn't use it for textures, for example).
|
||||
// so just accelerate step == 4 case.
|
||||
if (step == 4) {
|
||||
// this is a fairly straightforward implementation and not super-optimized.
|
||||
__m128i signflip = _mm_set1_epi8(-0x80);
|
||||
__m128i cr_const0 = _mm_set1_epi16((short)(1.40200f * 4096.0f + 0.5f));
|
||||
__m128i cr_const1 = _mm_set1_epi16(-(short)(0.71414f * 4096.0f + 0.5f));
|
||||
__m128i cb_const0 = _mm_set1_epi16(-(short)(0.34414f * 4096.0f + 0.5f));
|
||||
__m128i cb_const1 = _mm_set1_epi16((short)(1.77200f * 4096.0f + 0.5f));
|
||||
__m128i y_bias = _mm_set1_epi8((char)(unsigned char)128);
|
||||
__m128i xw = _mm_set1_epi16(255); // alpha channel
|
||||
|
||||
for (; i + 7 < count; i += 8) {
|
||||
// load
|
||||
__m128i y_bytes = _mm_loadl_epi64((__m128i *)(y + i));
|
||||
__m128i cr_bytes = _mm_loadl_epi64((__m128i *)(pcr + i));
|
||||
__m128i cb_bytes = _mm_loadl_epi64((__m128i *)(pcb + i));
|
||||
__m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
|
||||
__m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
|
||||
|
||||
// unpack to short (and left-shift cr, cb by 8)
|
||||
__m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes);
|
||||
__m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
|
||||
__m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
|
||||
|
||||
// color transform
|
||||
__m128i yws = _mm_srli_epi16(yw, 4);
|
||||
__m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
|
||||
__m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
|
||||
__m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
|
||||
__m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
|
||||
__m128i rws = _mm_add_epi16(cr0, yws);
|
||||
__m128i gwt = _mm_add_epi16(cb0, yws);
|
||||
__m128i bws = _mm_add_epi16(yws, cb1);
|
||||
__m128i gws = _mm_add_epi16(gwt, cr1);
|
||||
|
||||
// descale
|
||||
__m128i rw = _mm_srai_epi16(rws, 4);
|
||||
__m128i bw = _mm_srai_epi16(bws, 4);
|
||||
__m128i gw = _mm_srai_epi16(gws, 4);
|
||||
|
||||
// back to byte, set up for transpose
|
||||
__m128i brb = _mm_packus_epi16(rw, bw);
|
||||
__m128i gxb = _mm_packus_epi16(gw, xw);
|
||||
|
||||
// transpose to interleave channels
|
||||
__m128i t0 = _mm_unpacklo_epi8(brb, gxb);
|
||||
__m128i t1 = _mm_unpackhi_epi8(brb, gxb);
|
||||
__m128i o0 = _mm_unpacklo_epi16(t0, t1);
|
||||
__m128i o1 = _mm_unpackhi_epi16(t0, t1);
|
||||
|
||||
// store
|
||||
_mm_storeu_si128((__m128i *)(out + 0), o0);
|
||||
_mm_storeu_si128((__m128i *)(out + 16), o1);
|
||||
out += 32;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef STBI_NEON
|
||||
// in this version, step=3 support would be easy to add. but is there demand?
|
||||
if (step == 4) {
|
||||
// this is a fairly straightforward implementation and not super-optimized.
|
||||
uint8x8_t signflip = vdup_n_u8(0x80);
|
||||
int16x8_t cr_const0 = vdupq_n_s16((short)(1.40200f * 4096.0f + 0.5f));
|
||||
int16x8_t cr_const1 = vdupq_n_s16(-(short)(0.71414f * 4096.0f + 0.5f));
|
||||
int16x8_t cb_const0 = vdupq_n_s16(-(short)(0.34414f * 4096.0f + 0.5f));
|
||||
int16x8_t cb_const1 = vdupq_n_s16((short)(1.77200f * 4096.0f + 0.5f));
|
||||
|
||||
for (; i + 7 < count; i += 8) {
|
||||
// load
|
||||
uint8x8_t y_bytes = vld1_u8(y + i);
|
||||
uint8x8_t cr_bytes = vld1_u8(pcr + i);
|
||||
uint8x8_t cb_bytes = vld1_u8(pcb + i);
|
||||
int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
|
||||
int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
|
||||
|
||||
// expand to s16
|
||||
int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
|
||||
int16x8_t crw = vshll_n_s8(cr_biased, 7);
|
||||
int16x8_t cbw = vshll_n_s8(cb_biased, 7);
|
||||
|
||||
// color transform
|
||||
int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
|
||||
int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
|
||||
int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
|
||||
int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
|
||||
int16x8_t rws = vaddq_s16(yws, cr0);
|
||||
int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
|
||||
int16x8_t bws = vaddq_s16(yws, cb1);
|
||||
|
||||
// undo scaling, round, convert to byte
|
||||
uint8x8x4_t o;
|
||||
o.val[0] = vqrshrun_n_s16(rws, 4);
|
||||
o.val[1] = vqrshrun_n_s16(gws, 4);
|
||||
o.val[2] = vqrshrun_n_s16(bws, 4);
|
||||
o.val[3] = vdup_n_u8(255);
|
||||
|
||||
// store, interleaving r/g/b/a
|
||||
vst4_u8(out, o);
|
||||
out += 8 * 4;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; i < count; ++i) {
|
||||
int y_fixed = (y[i] << 20) + (1 << 19); // rounding
|
||||
int r, g, b;
|
||||
int cr = pcr[i] - 128;
|
||||
int cb = pcb[i] - 128;
|
||||
r = y_fixed + cr * stbi__float2fixed(1.40200f);
|
||||
g = y_fixed + cr * -stbi__float2fixed(0.71414f) +
|
||||
((cb * -stbi__float2fixed(0.34414f)) & 0xffff0000);
|
||||
b = y_fixed + cb * stbi__float2fixed(1.77200f);
|
||||
r >>= 20;
|
||||
g >>= 20;
|
||||
b >>= 20;
|
||||
if ((unsigned)r > 255) {
|
||||
if (r < 0)
|
||||
r = 0;
|
||||
else
|
||||
r = 255;
|
||||
}
|
||||
if ((unsigned)g > 255) {
|
||||
if (g < 0)
|
||||
g = 0;
|
||||
else
|
||||
g = 255;
|
||||
}
|
||||
if ((unsigned)b > 255) {
|
||||
if (b < 0)
|
||||
b = 0;
|
||||
else
|
||||
b = 255;
|
||||
}
|
||||
out[0] = (stbi_uc)r;
|
||||
out[1] = (stbi_uc)g;
|
||||
out[2] = (stbi_uc)b;
|
||||
out[3] = 255;
|
||||
out += step;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// set up the kernels
|
||||
static void stbi__setup_jpeg(stbi__jpeg *j) {
|
||||
j->resample_row_hv_2_kernel = stbi__resample_row_hv_2;
|
||||
|
|
93	third_party/stb/ycbcr-sse2.S	vendored

@@ -1,93 +0,0 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/macros.internal.h"
|
||||
|
||||
.balign 16
|
||||
stbi__YCbCr_to_RGB_row$sse2:
|
||||
.leafprologue
|
||||
.profilable
|
||||
xor %eax,%eax
|
||||
cmp $8,%r8d
|
||||
jl 1f
|
||||
xor %eax,%eax
|
||||
movdqa 2f(%rip),%xmm2
|
||||
movdqa 3f(%rip),%xmm8
|
||||
movdqa 4f(%rip),%xmm9
|
||||
movdqa 5f(%rip),%xmm10
|
||||
movdqa 6f(%rip),%xmm4
|
||||
movdqa 7f(%rip),%xmm5
|
||||
.balign 16
|
||||
0: movq (%rsi,%rax),%xmm6
|
||||
movq (%rcx,%rax),%xmm7
|
||||
movq (%rdx,%rax),%xmm1
|
||||
movdqa %xmm2,%xmm0
|
||||
punpcklbw %xmm6,%xmm0
|
||||
pxor %xmm2,%xmm7
|
||||
pxor %xmm6,%xmm6
|
||||
punpcklbw %xmm7,%xmm6
|
||||
pxor %xmm2,%xmm1
|
||||
pxor %xmm3,%xmm3
|
||||
punpcklbw %xmm1,%xmm3
|
||||
psrlw $4,%xmm0
|
||||
movdqa %xmm6,%xmm7
|
||||
pmulhw %xmm8,%xmm7
|
||||
movdqa %xmm3,%xmm1
|
||||
pmulhw %xmm9,%xmm1
|
||||
pmulhw %xmm10,%xmm3
|
||||
pmulhw %xmm4,%xmm6
|
||||
paddw %xmm1,%xmm6
|
||||
paddw %xmm0,%xmm7
|
||||
paddw %xmm0,%xmm3
|
||||
paddw %xmm0,%xmm6
|
||||
psraw $4,%xmm7
|
||||
psraw $4,%xmm3
|
||||
packuswb %xmm3,%xmm7
|
||||
psraw $4,%xmm6
|
||||
packuswb %xmm5,%xmm6
|
||||
movdqa %xmm7,%xmm0
|
||||
punpcklbw %xmm6,%xmm0
|
||||
punpckhbw %xmm6,%xmm7
|
||||
movdqa %xmm0,%xmm1
|
||||
punpcklwd %xmm7,%xmm1
|
||||
punpckhwd %xmm7,%xmm0
|
||||
movdqu %xmm1,(%rdi,%rax,4)
|
||||
movdqu %xmm0,16(%rdi,%rax,4)
|
||||
add $8,%rax
|
||||
lea 7(%rax),%r9d
|
||||
cmp %r8d,%r9d
|
||||
jl 0b
|
||||
1: .leafepilogue
|
||||
.endfn stbi__YCbCr_to_RGB_row$sse2,globl
|
||||
|
||||
.rodata.cst16
|
||||
2: .byte 128,128,128,128,128,128,128,128
|
||||
.zero 8
|
||||
3: .short 5743,5743,5743,5743,5743,5743,5743,5743
|
||||
4: .short 64126,64126,64126,64126,64126,64126,64126,64126
|
||||
5: .short 7258,7258,7258,7258,7258,7258,7258,7258
|
||||
6: .short 62611,62611,62611,62611,62611,62611,62611,62611
|
||||
7: .short 255,255,255,255,255,255,255,255
|
||||
|
||||
.end
|
||||
// These should be better but need to get them to work
|
||||
3: .short 11485,11485,11485,11485,11485,11485,11485,11485 # J′R m=13 99.964387%
|
||||
4: .short -11277,-11277,-11277,-11277,-11277,-11277,-11277,-11277 # J′G m=15 99.935941%
|
||||
5: .short 14516,14516,14516,14516,14516,14516,14516,14516 # J′B m=13 99.947219%
|
||||
6: .short -23401,-23401,-23401,-23401,-23401,-23401,-23401,-23401 # J′G m=15 99.935941%
|
||||
7: .short 255,255,255,255,255,255,255,255
|
56	third_party/stb/ycbcr.c	vendored

@@ -1,56 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2020 Justine Alexandra Roberts Tunney │
-│ │
-│ Permission to use, copy, modify, and/or distribute this software for │
-│ any purpose with or without fee is hereby granted, provided that the │
-│ above copyright notice and this permission notice appear in all copies. │
-│ │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
-│ PERFORMANCE OF THIS SOFTWARE. │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/limits.h"
-#include "libc/log/check.h"
-#include "libc/log/log.h"
-#include "libc/macros.internal.h"
-#include "libc/str/str.h"
-#include "third_party/stb/internal.h"
-
-/* this is a reduced-precision calculation of YCbCr-to-RGB introduced
-   to make sure the code produces the same results in both SIMD and scalar */
-#define FLOAT2FIXED(x) (((int)((x)*4096.0f + .5f)) << 8)
-
-void stbi__YCbCr_to_RGB_row(unsigned char *out, const unsigned char *y,
-                            const unsigned char *pcb, const unsigned char *pcr,
-                            unsigned count, unsigned step) {
-  unsigned i;
-  unsigned char b4[4];
-  int y_fixed, r, g, b, cr, cb;
-  CHECK(step == 3 || step == 4);
-  CHECK_LE(count, INT_MAX / 4 - 4);
-  for (i = step == 4 ? stbi__YCbCr_to_RGB_row$sse2(out, y, pcb, pcr, count) : 0;
-       i < count; ++i) {
-    y_fixed = (y[i] << 20) + (1 << 19); /* rounding */
-    cr = pcr[i] - 128;
-    cb = pcb[i] - 128;
-    r = y_fixed + cr * FLOAT2FIXED(1.40200f);
-    g = y_fixed + (cr * -FLOAT2FIXED(0.71414f)) +
-        ((cb * -FLOAT2FIXED(0.34414f)) & 0xffff0000);
-    b = y_fixed + cb * FLOAT2FIXED(1.77200f);
-    r >>= 20;
-    g >>= 20;
-    b >>= 20;
-    b4[0] = MIN(255, MAX(0, r));
-    b4[1] = MIN(255, MAX(0, g));
-    b4[2] = MIN(255, MAX(0, b));
-    b4[3] = 255;
-    memcpy(out + i * step, b4, 4);
-  }
-}