Make considerably more progress on AARCH64

- Utilities like pledge.com now build
- kprintf() will no longer balk at 48-bit addresses
- There's a new aarch64-dbg build mode that should work
- gc() and defer() are mostly pacified; avoid using them on aarch64 (a plain-ownership alternative is sketched below the change summary)
- THIRD_PARTY_STB now has Arm Neon intrinsics for fast image handling
Justine Tunney 2023-05-12 22:42:57 -07:00
parent 1bfb3aab1b
commit fd34ef732d
91 changed files with 1288 additions and 1192 deletions
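
Since the commit advises against gc() and defer() on aarch64 for now, here is a minimal sketch of the plain-ownership alternative (illustrative only, not part of this diff; header paths as used elsewhere in cosmopolitan):

#include "libc/mem/mem.h"
#include "libc/stdio/stdio.h"
#include "libc/x/x.h"

// Rather than puts(gc(xasprintf("pid=%d", n))), which frees the string
// when the enclosing frame unwinds, keep ownership explicit so the code
// behaves identically on aarch64 builds.
void PrintPid(int n) {
  char *s = xasprintf("pid=%d", n);  // heap-allocated formatted string
  puts(s);
  free(s);
}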


@@ -16,6 +16,7 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "third_party/stb/stb_image.h"
#include "libc/assert.h"
#include "libc/calls/calls.h"
#include "libc/fmt/conv.h"
@@ -31,20 +32,33 @@
#include "libc/stdio/stdio.h"
#include "libc/str/str.h"
#include "libc/x/x.h"
#include "third_party/stb/internal.h"
#include "third_party/stb/stb_image.h"
#define ROL(w, k) ((w) << (k) | CheckUnsigned(w) >> (sizeof(w) * 8 - (k)))
#include "third_party/aarch64/arm_neon.h"
#include "third_party/intel/ammintrin.internal.h"
asm(".ident\t\"\\n\\n\
stb_image (Public Domain)\\n\
Credit: Sean Barrett, et al.\\n\
http://nothings.org/stb\"");
#ifdef __x86_64__
#define STBI_SSE2
#define idct_block_kernel stbi__idct_simd
#elif defined(__aarch64__)
#define STBI_NEON
#define idct_block_kernel stbi__idct_simd
#else
#define idct_block_kernel stbi__idct_block
#endif
#define ROL(w, k) ((w) << (k) | CheckUnsigned(w) >> (sizeof(w) * 8 - (k)))
#ifndef STBI_REALLOC_SIZED
#define STBI_REALLOC_SIZED(p, oldsz, newsz) realloc(p, newsz)
#endif
typedef unsigned char stbi_uc;
typedef unsigned short stbi_us;
// stbi__context structure is our basic context used by all images, so it
// contains all the IO context, plus some basic image information
typedef struct {
@@ -1219,6 +1233,556 @@ forceinline unsigned char stbi__clamp(int x) {
return (unsigned char)x;
}
#define stbi__f2f(x) ((int)(((x)*4096 + 0.5)))
#define stbi__fsh(x) ((x)*4096)
// derived from jidctint -- DCT_ISLOW
#define STBI__IDCT_1D(s0, s1, s2, s3, s4, s5, s6, s7) \
int t0, t1, t2, t3, p1, p2, p3, p4, p5, x0, x1, x2, x3; \
p2 = s2; \
p3 = s6; \
p1 = (p2 + p3) * stbi__f2f(0.5411961f); \
t2 = p1 + p3 * stbi__f2f(-1.847759065f); \
t3 = p1 + p2 * stbi__f2f(0.765366865f); \
p2 = s0; \
p3 = s4; \
t0 = stbi__fsh(p2 + p3); \
t1 = stbi__fsh(p2 - p3); \
x0 = t0 + t3; \
x3 = t0 - t3; \
x1 = t1 + t2; \
x2 = t1 - t2; \
t0 = s7; \
t1 = s5; \
t2 = s3; \
t3 = s1; \
p3 = t0 + t2; \
p4 = t1 + t3; \
p1 = t0 + t3; \
p2 = t1 + t2; \
p5 = (p3 + p4) * stbi__f2f(1.175875602f); \
t0 = t0 * stbi__f2f(0.298631336f); \
t1 = t1 * stbi__f2f(2.053119869f); \
t2 = t2 * stbi__f2f(3.072711026f); \
t3 = t3 * stbi__f2f(1.501321110f); \
p1 = p5 + p1 * stbi__f2f(-0.899976223f); \
p2 = p5 + p2 * stbi__f2f(-2.562915447f); \
p3 = p3 * stbi__f2f(-1.961570560f); \
p4 = p4 * stbi__f2f(-0.390180644f); \
t3 += p1 + p4; \
t2 += p2 + p3; \
t1 += p2 + p4; \
t0 += p1 + p3;
static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64]) {
int i, val[64], *v = val;
stbi_uc *o;
short *d = data;
// columns
for (i = 0; i < 8; ++i, ++d, ++v) {
// if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
if (d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0 && d[40] == 0 &&
d[48] == 0 && d[56] == 0) {
// no shortcut 0 seconds
// (1|2|3|4|5|6|7)==0 0 seconds
// all separate -0.047 seconds
// 1 && 2|3 && 4|5 && 6|7: -0.047 seconds
int dcterm = d[0] * 4;
v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
} else {
STBI__IDCT_1D(d[0], d[8], d[16], d[24], d[32], d[40], d[48], d[56])
// constants scaled things up by 1<<12; let's bring them back
// down, but keep 2 extra bits of precision
x0 += 512;
x1 += 512;
x2 += 512;
x3 += 512;
v[0] = (x0 + t3) >> 10;
v[56] = (x0 - t3) >> 10;
v[8] = (x1 + t2) >> 10;
v[48] = (x1 - t2) >> 10;
v[16] = (x2 + t1) >> 10;
v[40] = (x2 - t1) >> 10;
v[24] = (x3 + t0) >> 10;
v[32] = (x3 - t0) >> 10;
}
}
for (i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride) {
// no fast case since the first 1D IDCT spread components out
STBI__IDCT_1D(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7])
// constants scaled things up by 1<<12, plus we had 1<<2 from first
// loop, plus horizontal and vertical each scale by sqrt(8) so together
// we've got an extra 1<<3, so 1<<17 total we need to remove.
// so we want to round that, which means adding 0.5 * 1<<17,
// aka 65536. Also, we'll end up with -128 to 127 that we want
// to encode as 0..255 by adding 128, so we'll add that before the shift
x0 += 65536 + (128 << 17);
x1 += 65536 + (128 << 17);
x2 += 65536 + (128 << 17);
x3 += 65536 + (128 << 17);
// tried computing the shifts into temps, or'ing the temps to see
// if any were out of range, but that was slower
o[0] = stbi__clamp((x0 + t3) >> 17);
o[7] = stbi__clamp((x0 - t3) >> 17);
o[1] = stbi__clamp((x1 + t2) >> 17);
o[6] = stbi__clamp((x1 - t2) >> 17);
o[2] = stbi__clamp((x2 + t1) >> 17);
o[5] = stbi__clamp((x2 - t1) >> 17);
o[3] = stbi__clamp((x3 + t0) >> 17);
o[4] = stbi__clamp((x3 - t0) >> 17);
}
}
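The shift and bias constants in both loops above come from simple scale bookkeeping; a minimal self-check of that arithmetic (an illustrative aside, not part of stb_image):

// 12 bits from the stbi__f2f constants, 2 extra bits kept by the column
// pass (it shifts by 10 rather than 12), and 3 bits because the two 1-D
// passes together scale by sqrt(8)*sqrt(8) == 8.
_Static_assert(12 + 2 + 3 == 17, "bits to remove in the row pass");
// 65536 is half of 1<<17 (the +0.5 rounding bias before >>17) and 128<<17
// re-centers the signed result onto 0..255, so a zero coefficient block
// comes out as mid-gray.
_Static_assert((65536 + (128 << 17)) >> 17 == 128, "bias maps 0 to 128");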
#ifdef STBI_SSE2
// sse2 integer IDCT. not the fastest possible implementation but it
// produces bit-identical results to the generic C version so it's
// fully "transparent".
static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) {
// This is constructed to match our regular (generic) integer IDCT exactly.
__m128i row0, row1, row2, row3, row4, row5, row6, row7;
__m128i tmp;
// dot product constant: even elems=x, odd elems=y
#define dct_const(x, y) _mm_setr_epi16((x), (y), (x), (y), (x), (y), (x), (y))
// out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit)
// out(1) = c1[even]*x + c1[odd]*y
#define dct_rot(out0, out1, x, y, c0, c1) \
__m128i c0##lo = _mm_unpacklo_epi16((x), (y)); \
__m128i c0##hi = _mm_unpackhi_epi16((x), (y)); \
__m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
__m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
__m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
__m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
// out = in << 12 (in 16-bit, out 32-bit)
#define dct_widen(out, in) \
__m128i out##_l = \
_mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
__m128i out##_h = \
_mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
// wide add
#define dct_wadd(out, a, b) \
__m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
__m128i out##_h = _mm_add_epi32(a##_h, b##_h)
// wide sub
#define dct_wsub(out, a, b) \
__m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
__m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
// butterfly a/b, add bias, then shift by "s" and pack
#define dct_bfly32o(out0, out1, a, b, bias, s) \
{ \
__m128i abiased_l = _mm_add_epi32(a##_l, bias); \
__m128i abiased_h = _mm_add_epi32(a##_h, bias); \
dct_wadd(sum, abiased, b); \
dct_wsub(dif, abiased, b); \
out0 = \
_mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
out1 = \
_mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
}
// 8-bit interleave step (for transposes)
#define dct_interleave8(a, b) \
tmp = a; \
a = _mm_unpacklo_epi8(a, b); \
b = _mm_unpackhi_epi8(tmp, b)
// 16-bit interleave step (for transposes)
#define dct_interleave16(a, b) \
tmp = a; \
a = _mm_unpacklo_epi16(a, b); \
b = _mm_unpackhi_epi16(tmp, b)
#define dct_pass(bias, shift) \
{ \
/* even part */ \
dct_rot(t2e, t3e, row2, row6, rot0_0, rot0_1); \
__m128i sum04 = _mm_add_epi16(row0, row4); \
__m128i dif04 = _mm_sub_epi16(row0, row4); \
dct_widen(t0e, sum04); \
dct_widen(t1e, dif04); \
dct_wadd(x0, t0e, t3e); \
dct_wsub(x3, t0e, t3e); \
dct_wadd(x1, t1e, t2e); \
dct_wsub(x2, t1e, t2e); \
/* odd part */ \
dct_rot(y0o, y2o, row7, row3, rot2_0, rot2_1); \
dct_rot(y1o, y3o, row5, row1, rot3_0, rot3_1); \
__m128i sum17 = _mm_add_epi16(row1, row7); \
__m128i sum35 = _mm_add_epi16(row3, row5); \
dct_rot(y4o, y5o, sum17, sum35, rot1_0, rot1_1); \
dct_wadd(x4, y0o, y4o); \
dct_wadd(x5, y1o, y5o); \
dct_wadd(x6, y2o, y5o); \
dct_wadd(x7, y3o, y4o); \
dct_bfly32o(row0, row7, x0, x7, bias, shift); \
dct_bfly32o(row1, row6, x1, x6, bias, shift); \
dct_bfly32o(row2, row5, x2, x5, bias, shift); \
dct_bfly32o(row3, row4, x3, x4, bias, shift); \
}
__m128i rot0_0 = dct_const(stbi__f2f(0.5411961f),
stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
__m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f(0.765366865f),
stbi__f2f(0.5411961f));
__m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f),
stbi__f2f(1.175875602f));
__m128i rot1_1 =
dct_const(stbi__f2f(1.175875602f),
stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
__m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f(0.298631336f),
stbi__f2f(-1.961570560f));
__m128i rot2_1 =
dct_const(stbi__f2f(-1.961570560f),
stbi__f2f(-1.961570560f) + stbi__f2f(3.072711026f));
__m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f(2.053119869f),
stbi__f2f(-0.390180644f));
__m128i rot3_1 =
dct_const(stbi__f2f(-0.390180644f),
stbi__f2f(-0.390180644f) + stbi__f2f(1.501321110f));
// rounding biases in column/row passes, see stbi__idct_block for explanation.
__m128i bias_0 = _mm_set1_epi32(512);
__m128i bias_1 = _mm_set1_epi32(65536 + (128 << 17));
// load
row0 = _mm_load_si128((const __m128i *)(data + 0 * 8));
row1 = _mm_load_si128((const __m128i *)(data + 1 * 8));
row2 = _mm_load_si128((const __m128i *)(data + 2 * 8));
row3 = _mm_load_si128((const __m128i *)(data + 3 * 8));
row4 = _mm_load_si128((const __m128i *)(data + 4 * 8));
row5 = _mm_load_si128((const __m128i *)(data + 5 * 8));
row6 = _mm_load_si128((const __m128i *)(data + 6 * 8));
row7 = _mm_load_si128((const __m128i *)(data + 7 * 8));
// column pass
dct_pass(bias_0, 10);
{
// 16bit 8x8 transpose pass 1
dct_interleave16(row0, row4);
dct_interleave16(row1, row5);
dct_interleave16(row2, row6);
dct_interleave16(row3, row7);
// transpose pass 2
dct_interleave16(row0, row2);
dct_interleave16(row1, row3);
dct_interleave16(row4, row6);
dct_interleave16(row5, row7);
// transpose pass 3
dct_interleave16(row0, row1);
dct_interleave16(row2, row3);
dct_interleave16(row4, row5);
dct_interleave16(row6, row7);
}
// row pass
dct_pass(bias_1, 17);
{
// pack
__m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
__m128i p1 = _mm_packus_epi16(row2, row3);
__m128i p2 = _mm_packus_epi16(row4, row5);
__m128i p3 = _mm_packus_epi16(row6, row7);
// 8bit 8x8 transpose pass 1
dct_interleave8(p0, p2); // a0e0a1e1...
dct_interleave8(p1, p3); // c0g0c1g1...
// transpose pass 2
dct_interleave8(p0, p1); // a0c0e0g0...
dct_interleave8(p2, p3); // b0d0f0h0...
// transpose pass 3
dct_interleave8(p0, p2); // a0b0c0d0...
dct_interleave8(p1, p3); // a4b4c4d4...
// store
_mm_storel_epi64((__m128i *)out, p0);
out += out_stride;
_mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p0, 0x4e));
out += out_stride;
_mm_storel_epi64((__m128i *)out, p2);
out += out_stride;
_mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p2, 0x4e));
out += out_stride;
_mm_storel_epi64((__m128i *)out, p1);
out += out_stride;
_mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p1, 0x4e));
out += out_stride;
_mm_storel_epi64((__m128i *)out, p3);
out += out_stride;
_mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p3, 0x4e));
}
#undef dct_const
#undef dct_rot
#undef dct_widen
#undef dct_wadd
#undef dct_wsub
#undef dct_bfly32o
#undef dct_interleave8
#undef dct_interleave16
#undef dct_pass
}
#endif // STBI_SSE2
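dct_rot above hinges on _mm_madd_epi16, which multiplies adjacent 16-bit pairs and sums each pair into one 32-bit lane; a stand-alone sketch of that building block (illustrative only, with cosmopolitan's emmintrin header path assumed):

#ifdef __x86_64__
#include "third_party/intel/emmintrin.internal.h"
// Computes out[i] = c_even*x[i] + c_odd*y[i] for eight lanes, which is
// exactly how dct_rot applies a pair of rotation constants at once.
static void DctRotDemo(const short x[8], const short y[8], short c_even,
                       short c_odd, int out[8]) {
  __m128i vx = _mm_loadu_si128((const __m128i *)x);
  __m128i vy = _mm_loadu_si128((const __m128i *)y);
  __m128i c = _mm_setr_epi16(c_even, c_odd, c_even, c_odd,  //
                             c_even, c_odd, c_even, c_odd);
  __m128i lo = _mm_unpacklo_epi16(vx, vy);  // x0,y0,x1,y1,x2,y2,x3,y3
  __m128i hi = _mm_unpackhi_epi16(vx, vy);  // x4,y4,...,x7,y7
  _mm_storeu_si128((__m128i *)(out + 0), _mm_madd_epi16(lo, c));
  _mm_storeu_si128((__m128i *)(out + 4), _mm_madd_epi16(hi, c));
}
#endif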
#ifdef STBI_NEON
// NEON integer IDCT. should produce bit-identical
// results to the generic C version.
static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) {
int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f));
int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
int16x4_t rot0_2 = vdup_n_s16(stbi__f2f(0.765366865f));
int16x4_t rot1_0 = vdup_n_s16(stbi__f2f(1.175875602f));
int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
int16x4_t rot3_0 = vdup_n_s16(stbi__f2f(0.298631336f));
int16x4_t rot3_1 = vdup_n_s16(stbi__f2f(2.053119869f));
int16x4_t rot3_2 = vdup_n_s16(stbi__f2f(3.072711026f));
int16x4_t rot3_3 = vdup_n_s16(stbi__f2f(1.501321110f));
#define dct_long_mul(out, inq, coeff) \
int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
#define dct_long_mac(out, acc, inq, coeff) \
int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
#define dct_widen(out, inq) \
int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
// wide add
#define dct_wadd(out, a, b) \
int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
// wide sub
#define dct_wsub(out, a, b) \
int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
// butterfly a/b, then shift using "shiftop" by "s" and pack
#define dct_bfly32o(out0, out1, a, b, shiftop, s) \
{ \
dct_wadd(sum, a, b); \
dct_wsub(dif, a, b); \
out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
}
#define dct_pass(shiftop, shift) \
{ \
/* even part */ \
int16x8_t sum26 = vaddq_s16(row2, row6); \
dct_long_mul(p1e, sum26, rot0_0); \
dct_long_mac(t2e, p1e, row6, rot0_1); \
dct_long_mac(t3e, p1e, row2, rot0_2); \
int16x8_t sum04 = vaddq_s16(row0, row4); \
int16x8_t dif04 = vsubq_s16(row0, row4); \
dct_widen(t0e, sum04); \
dct_widen(t1e, dif04); \
dct_wadd(x0, t0e, t3e); \
dct_wsub(x3, t0e, t3e); \
dct_wadd(x1, t1e, t2e); \
dct_wsub(x2, t1e, t2e); \
/* odd part */ \
int16x8_t sum15 = vaddq_s16(row1, row5); \
int16x8_t sum17 = vaddq_s16(row1, row7); \
int16x8_t sum35 = vaddq_s16(row3, row5); \
int16x8_t sum37 = vaddq_s16(row3, row7); \
int16x8_t sumodd = vaddq_s16(sum17, sum35); \
dct_long_mul(p5o, sumodd, rot1_0); \
dct_long_mac(p1o, p5o, sum17, rot1_1); \
dct_long_mac(p2o, p5o, sum35, rot1_2); \
dct_long_mul(p3o, sum37, rot2_0); \
dct_long_mul(p4o, sum15, rot2_1); \
dct_wadd(sump13o, p1o, p3o); \
dct_wadd(sump24o, p2o, p4o); \
dct_wadd(sump23o, p2o, p3o); \
dct_wadd(sump14o, p1o, p4o); \
dct_long_mac(x4, sump13o, row7, rot3_0); \
dct_long_mac(x5, sump24o, row5, rot3_1); \
dct_long_mac(x6, sump23o, row3, rot3_2); \
dct_long_mac(x7, sump14o, row1, rot3_3); \
dct_bfly32o(row0, row7, x0, x7, shiftop, shift); \
dct_bfly32o(row1, row6, x1, x6, shiftop, shift); \
dct_bfly32o(row2, row5, x2, x5, shiftop, shift); \
dct_bfly32o(row3, row4, x3, x4, shiftop, shift); \
}
// load
row0 = vld1q_s16(data + 0 * 8);
row1 = vld1q_s16(data + 1 * 8);
row2 = vld1q_s16(data + 2 * 8);
row3 = vld1q_s16(data + 3 * 8);
row4 = vld1q_s16(data + 4 * 8);
row5 = vld1q_s16(data + 5 * 8);
row6 = vld1q_s16(data + 6 * 8);
row7 = vld1q_s16(data + 7 * 8);
// add DC bias
row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
// column pass
dct_pass(vrshrn_n_s32, 10);
// 16bit 8x8 transpose
{
// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
// whether compilers actually get this is another story, sadly.
#define dct_trn16(x, y) \
{ \
int16x8x2_t t = vtrnq_s16(x, y); \
x = t.val[0]; \
y = t.val[1]; \
}
#define dct_trn32(x, y) \
{ \
int32x4x2_t t = \
vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); \
x = vreinterpretq_s16_s32(t.val[0]); \
y = vreinterpretq_s16_s32(t.val[1]); \
}
#define dct_trn64(x, y) \
{ \
int16x8_t x0 = x; \
int16x8_t y0 = y; \
x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); \
y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); \
}
// pass 1
dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
dct_trn16(row2, row3);
dct_trn16(row4, row5);
dct_trn16(row6, row7);
// pass 2
dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
dct_trn32(row1, row3);
dct_trn32(row4, row6);
dct_trn32(row5, row7);
// pass 3
dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
dct_trn64(row1, row5);
dct_trn64(row2, row6);
dct_trn64(row3, row7);
#undef dct_trn16
#undef dct_trn32
#undef dct_trn64
}
// row pass
// vrshrn_n_s32 only supports shifts up to 16, we need
// 17. so do a non-rounding shift of 16 first then follow
// up with a rounding shift by 1.
dct_pass(vshrn_n_s32, 16);
{
// pack and round
uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
// again, these can translate into one instruction, but often don't.
#define dct_trn8_8(x, y) \
{ \
uint8x8x2_t t = vtrn_u8(x, y); \
x = t.val[0]; \
y = t.val[1]; \
}
#define dct_trn8_16(x, y) \
{ \
uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); \
x = vreinterpret_u8_u16(t.val[0]); \
y = vreinterpret_u8_u16(t.val[1]); \
}
#define dct_trn8_32(x, y) \
{ \
uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); \
x = vreinterpret_u8_u32(t.val[0]); \
y = vreinterpret_u8_u32(t.val[1]); \
}
// sadly can't use interleaved stores here since we only write
// 8 bytes to each scan line!
// 8x8 8-bit transpose pass 1
dct_trn8_8(p0, p1);
dct_trn8_8(p2, p3);
dct_trn8_8(p4, p5);
dct_trn8_8(p6, p7);
// pass 2
dct_trn8_16(p0, p2);
dct_trn8_16(p1, p3);
dct_trn8_16(p4, p6);
dct_trn8_16(p5, p7);
// pass 3
dct_trn8_32(p0, p4);
dct_trn8_32(p1, p5);
dct_trn8_32(p2, p6);
dct_trn8_32(p3, p7);
// store
vst1_u8(out, p0);
out += out_stride;
vst1_u8(out, p1);
out += out_stride;
vst1_u8(out, p2);
out += out_stride;
vst1_u8(out, p3);
out += out_stride;
vst1_u8(out, p4);
out += out_stride;
vst1_u8(out, p5);
out += out_stride;
vst1_u8(out, p6);
out += out_stride;
vst1_u8(out, p7);
#undef dct_trn8_8
#undef dct_trn8_16
#undef dct_trn8_32
}
#undef dct_long_mul
#undef dct_long_mac
#undef dct_widen
#undef dct_wadd
#undef dct_wsub
#undef dct_bfly32o
#undef dct_pass
}
#endif // STBI_NEON
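The workaround noted above (vrshrn_n_s32 tops out at a shift of 16 while the row pass needs 17) can be shown in isolation; a sketch assuming the same third_party/aarch64/arm_neon.h intrinsics this file already includes:

#ifdef __aarch64__
#include "third_party/aarch64/arm_neon.h"
// Narrow a 1<<17-scaled 32-bit accumulator pair down to clamped bytes: a
// plain (non-rounding) >>16 while narrowing to 16 bits, then a rounding,
// saturating >>1 while packing to u8 -- the same two steps performed by
// dct_pass(vshrn_n_s32, 16) followed by vqrshrun_n_s16(rowN, 1) above.
static uint8x8_t Descale17Demo(int32x4_t acc_lo, int32x4_t acc_hi) {
  int16x8_t w = vcombine_s16(vshrn_n_s32(acc_lo, 16), vshrn_n_s32(acc_hi, 16));
  return vqrshrun_n_s16(w, 1);
}
#endif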
#define STBI__MARKER_none 0xff
// if there's a pending marker from the entropy stream, return that
// otherwise, fetch from the stream and get a marker. if there's no
@@ -1275,7 +1839,7 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg *z) {
z->huff_ac + ha, z->fast_ac[ha], n,
z->dequant[z->img_comp[n].tq]))
return 0;
stbi__idct_simd$sse(
idct_block_kernel(
z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8,
z->img_comp[n].w2, data);
// every data block is an MCU, so countdown the restart interval
@@ -1309,7 +1873,7 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg *z) {
z->huff_ac + ha, z->fast_ac[ha], n,
z->dequant[z->img_comp[n].tq]))
return 0;
stbi__idct_simd$sse(
idct_block_kernel(
z->img_comp[n].data + z->img_comp[n].w2 * y2 + x2,
z->img_comp[n].w2, data);
}
@@ -1411,7 +1975,7 @@ static void stbi__jpeg_finish(stbi__jpeg *z) {
short *data =
z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
stbi__idct_simd$sse(
idct_block_kernel(
z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8,
z->img_comp[n].w2, data);
}
@@ -1905,6 +2469,203 @@ static unsigned char *stbi__resample_row_nearest(unsigned char *out,
return out;
}
// this is a reduced-precision calculation of YCbCr-to-RGB introduced
// to make sure the code produces the same results in both SIMD and scalar
#define stbi__float2fixed(x) (((int)((x)*4096.0f + 0.5f)) << 8)
static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y,
const stbi_uc *pcb, const stbi_uc *pcr,
int count, int step) {
int i;
for (i = 0; i < count; ++i) {
int y_fixed = (y[i] << 20) + (1 << 19); // rounding
int r, g, b;
int cr = pcr[i] - 128;
int cb = pcb[i] - 128;
r = y_fixed + cr * stbi__float2fixed(1.40200f);
g = y_fixed + (cr * -stbi__float2fixed(0.71414f)) +
((cb * -stbi__float2fixed(0.34414f)) & 0xffff0000);
b = y_fixed + cb * stbi__float2fixed(1.77200f);
r >>= 20;
g >>= 20;
b >>= 20;
if ((unsigned)r > 255) {
if (r < 0)
r = 0;
else
r = 255;
}
if ((unsigned)g > 255) {
if (g < 0)
g = 0;
else
g = 255;
}
if ((unsigned)b > 255) {
if (b < 0)
b = 0;
else
b = 255;
}
out[0] = (stbi_uc)r;
out[1] = (stbi_uc)g;
out[2] = (stbi_uc)b;
out[3] = 255;
out += step;
}
}
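stbi__float2fixed pins everything to 20-bit fixed point so the SIMD and scalar paths agree bit for bit; one channel of the loop above, unrolled as a sketch (illustrative only, not part of stb_image):

// Red for a single pixel: 1.40200 becomes round(1.40200*4096)<<8 == 5743<<8,
// y is pre-scaled by 1<<20 with a half-ulp rounding bias, and >>20 drops
// the result back to 8 bits before clamping to 0..255.
static unsigned char DemoRedChannel(unsigned char y, unsigned char cr) {
  int y_fixed = (y << 20) + (1 << 19);
  int r = (y_fixed + (cr - 128) * stbi__float2fixed(1.40200f)) >> 20;
  return r < 0 ? 0 : r > 255 ? 255 : (unsigned char)r;
}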
#if defined(STBI_SSE2) || defined(STBI_NEON)
static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y,
stbi_uc const *pcb, stbi_uc const *pcr,
int count, int step) {
int i = 0;
#ifdef STBI_SSE2
// step == 3 is pretty ugly on the final interleave, and i'm not convinced
// it's useful in practice (you wouldn't use it for textures, for example).
// so just accelerate step == 4 case.
if (step == 4) {
// this is a fairly straightforward implementation and not super-optimized.
__m128i signflip = _mm_set1_epi8(-0x80);
__m128i cr_const0 = _mm_set1_epi16((short)(1.40200f * 4096.0f + 0.5f));
__m128i cr_const1 = _mm_set1_epi16(-(short)(0.71414f * 4096.0f + 0.5f));
__m128i cb_const0 = _mm_set1_epi16(-(short)(0.34414f * 4096.0f + 0.5f));
__m128i cb_const1 = _mm_set1_epi16((short)(1.77200f * 4096.0f + 0.5f));
__m128i y_bias = _mm_set1_epi8((char)(unsigned char)128);
__m128i xw = _mm_set1_epi16(255); // alpha channel
for (; i + 7 < count; i += 8) {
// load
__m128i y_bytes = _mm_loadl_epi64((__m128i *)(y + i));
__m128i cr_bytes = _mm_loadl_epi64((__m128i *)(pcr + i));
__m128i cb_bytes = _mm_loadl_epi64((__m128i *)(pcb + i));
__m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
__m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
// unpack to short (and left-shift cr, cb by 8)
__m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes);
__m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
__m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
// color transform
__m128i yws = _mm_srli_epi16(yw, 4);
__m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
__m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
__m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
__m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
__m128i rws = _mm_add_epi16(cr0, yws);
__m128i gwt = _mm_add_epi16(cb0, yws);
__m128i bws = _mm_add_epi16(yws, cb1);
__m128i gws = _mm_add_epi16(gwt, cr1);
// descale
__m128i rw = _mm_srai_epi16(rws, 4);
__m128i bw = _mm_srai_epi16(bws, 4);
__m128i gw = _mm_srai_epi16(gws, 4);
// back to byte, set up for transpose
__m128i brb = _mm_packus_epi16(rw, bw);
__m128i gxb = _mm_packus_epi16(gw, xw);
// transpose to interleave channels
__m128i t0 = _mm_unpacklo_epi8(brb, gxb);
__m128i t1 = _mm_unpackhi_epi8(brb, gxb);
__m128i o0 = _mm_unpacklo_epi16(t0, t1);
__m128i o1 = _mm_unpackhi_epi16(t0, t1);
// store
_mm_storeu_si128((__m128i *)(out + 0), o0);
_mm_storeu_si128((__m128i *)(out + 16), o1);
out += 32;
}
}
#endif
#ifdef STBI_NEON
// in this version, step=3 support would be easy to add. but is there demand?
if (step == 4) {
// this is a fairly straightforward implementation and not super-optimized.
uint8x8_t signflip = vdup_n_u8(0x80);
int16x8_t cr_const0 = vdupq_n_s16((short)(1.40200f * 4096.0f + 0.5f));
int16x8_t cr_const1 = vdupq_n_s16(-(short)(0.71414f * 4096.0f + 0.5f));
int16x8_t cb_const0 = vdupq_n_s16(-(short)(0.34414f * 4096.0f + 0.5f));
int16x8_t cb_const1 = vdupq_n_s16((short)(1.77200f * 4096.0f + 0.5f));
for (; i + 7 < count; i += 8) {
// load
uint8x8_t y_bytes = vld1_u8(y + i);
uint8x8_t cr_bytes = vld1_u8(pcr + i);
uint8x8_t cb_bytes = vld1_u8(pcb + i);
int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
// expand to s16
int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
int16x8_t crw = vshll_n_s8(cr_biased, 7);
int16x8_t cbw = vshll_n_s8(cb_biased, 7);
// color transform
int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
int16x8_t rws = vaddq_s16(yws, cr0);
int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
int16x8_t bws = vaddq_s16(yws, cb1);
// undo scaling, round, convert to byte
uint8x8x4_t o;
o.val[0] = vqrshrun_n_s16(rws, 4);
o.val[1] = vqrshrun_n_s16(gws, 4);
o.val[2] = vqrshrun_n_s16(bws, 4);
o.val[3] = vdup_n_u8(255);
// store, interleaving r/g/b/a
vst4_u8(out, o);
out += 8 * 4;
}
}
#endif
for (; i < count; ++i) {
int y_fixed = (y[i] << 20) + (1 << 19); // rounding
int r, g, b;
int cr = pcr[i] - 128;
int cb = pcb[i] - 128;
r = y_fixed + cr * stbi__float2fixed(1.40200f);
g = y_fixed + cr * -stbi__float2fixed(0.71414f) +
((cb * -stbi__float2fixed(0.34414f)) & 0xffff0000);
b = y_fixed + cb * stbi__float2fixed(1.77200f);
r >>= 20;
g >>= 20;
b >>= 20;
if ((unsigned)r > 255) {
if (r < 0)
r = 0;
else
r = 255;
}
if ((unsigned)g > 255) {
if (g < 0)
g = 0;
else
g = 255;
}
if ((unsigned)b > 255) {
if (b < 0)
b = 0;
else
b = 255;
}
out[0] = (stbi_uc)r;
out[1] = (stbi_uc)g;
out[2] = (stbi_uc)b;
out[3] = 255;
out += step;
}
}
#endif
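Both SIMD kernels above keep the chroma constants in 4.12 fixed point and recover the product with a high-half multiply; a scalar sketch of what that step computes (the name is mine, illustrative only):

// _mm_mulhi_epi16(cr_const0, cr<<8) on SSE2 and vqdmulhq_s16(cr<<7, cr_const0)
// on NEON both evaluate to this: cr * 1.40200 in 4-bit fixed point, ready to
// be added to y*16 and descaled by the final shift right of 4.
static int ChromaMulHiDemo(int cr_minus_128) {
  const int cr_const0 = (int)(1.40200f * 4096.0f + 0.5f);  // 5743 == 1.402 in 4.12
  return (cr_const0 * (cr_minus_128 << 8)) >> 16;          // == cr * 1.402 * 16
}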
// set up the kernels
static void stbi__setup_jpeg(stbi__jpeg *j) {
j->resample_row_hv_2_kernel = stbi__resample_row_hv_2;