Upgrade pl_mpeg

This change gets printvideo working on aarch64. It also improves the
performance of magikarp decimation on aarch64. The last of the old
portable x86 intrinsics library is gone, though it lives on in Blink.
Justine Tunney 2024-09-06 19:02:53 -07:00
parent 5d3b91d8b9
commit d1157d471f
46 changed files with 4587 additions and 4449 deletions
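
In scalar terms, the decimator this diff rewrites computes, for every output
byte, an 8-tap convolution of the input sampled at stride 2:
y[i] = (K[0]*x[2i-3] + K[1]*x[2i-2] + ... + K[7]*x[2i+4] + 16) >> 5, clamped
to 0..255, with the edges padded by repeating x[0] and x[n-1]. Below is a
minimal portable sketch of that math, assuming an illustrative magikarp-style
kernel whose taps sum to 32 (1 << SCALE) so the shift renormalizes; the tap
values are stand-ins, not the ones shipped in dsp/scale.

#include <stdio.h>

/* Illustrative 8-tap low-pass kernel. Taps sum to 32, so >> 5 below
   renormalizes to unity gain. Stand-in values, not from dsp/scale. */
static const signed char kTaps[8] = {-1, -3, 3, 17, 17, 3, -3, -1};

/* One output sample of 2x decimation: 8-tap convolution reaching from
   3 samples behind 2*i to 4 samples ahead, rounded (ROUND = 1 << 4),
   scaled (>> 5), with edges clamped the way the real code pads with
   copies of A[0] and A[n - 1]. */
static unsigned char Decimate1(const unsigned char *x, long n, long i,
                               const signed char k[8]) {
  int acc = 1 << 4;
  for (int t = 0; t < 8; ++t) {
    long j = 2 * i - 3 + t;
    j = j < 0 ? 0 : j > n - 1 ? n - 1 : j;  /* clamp at the edges */
    acc += k[t] * x[j];
  }
  acc >>= 5;
  return acc < 0 ? 0 : acc > 255 ? 255 : acc;
}

int main(void) {
  unsigned char row[16], half[8];
  for (int i = 0; i < 16; ++i) row[i] = i * 16;  /* a ramp */
  for (int i = 0; i < 8; ++i) half[i] = Decimate1(row, 16, i, kTaps);
  for (int i = 0; i < 8; ++i) printf("%d ", half[i]);  /* halved ramp */
  printf("\n");
  return 0;
}

Note that aarch64 takes the new portable branch at the bottom of the diff;
only x86-64 gets the SSSE3 path.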

@@ -16,17 +16,20 @@
 TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 PERFORMANCE OF THIS SOFTWARE.
 */
-#include "libc/assert.h"
-#include "libc/intrin/packuswb.h"
-#include "libc/intrin/paddw.h"
-#include "libc/intrin/palignr.h"
-#include "libc/intrin/pmaddubsw.h"
-#include "libc/intrin/psraw.h"
 #include "libc/log/check.h"
 #include "libc/log/log.h"
 #include "libc/macros.h"
-#include "libc/nexgen32e/x86feature.h"
 #include "libc/str/str.h"
+#include "third_party/intel/immintrin.internal.h"
+/**
+ * Performs 2D Motion Picture Convolution Acceleration by Leveraging SSSE3.
+ *
+ * @note H/T John Costella, Jean-Baptiste Joseph Fourier
+ * @note RIP Huixiang Chen
+ */
+void *cDecimate2xUint8x8(unsigned long n, unsigned char A[n],
+                         const signed char K[8]) {
+#ifdef __x86_64__
 #define TAPS 8
 #define RATIO 2
 #define OFFSET 3
@@ -37,62 +40,107 @@
 #define LOOKAHEAD (SPREAD - LOOKBEHIND)
 #define SCALE 5
 #define ROUND (1 << (SCALE - 1))
-/**
- * Performs 2D Motion Picture Convolution Acceleration by Leveraging SSSE3.
- *
- * @note H/T John Costella, Jean-Baptiste Joseph Fourier
- * @note RIP Huixiang Chen
- */
-void *cDecimate2xUint8x8(unsigned long n, unsigned char A[n],
-                         const signed char K[8]) {
-  short kRound[8] = {ROUND, ROUND, ROUND, ROUND, ROUND, ROUND, ROUND, ROUND};
-  signed char kMadd1[16] = {K[0], K[1], K[0], K[1], K[0], K[1], K[0], K[1],
-                            K[0], K[1], K[0], K[1], K[0], K[1], K[0], K[1]};
-  signed char kMadd2[16] = {K[2], K[3], K[2], K[3], K[2], K[3], K[2], K[3],
-                            K[2], K[3], K[2], K[3], K[2], K[3], K[2], K[3]};
-  signed char kMadd3[16] = {K[4], K[5], K[4], K[5], K[4], K[5], K[4], K[5],
-                            K[4], K[5], K[4], K[5], K[4], K[5], K[4], K[5]};
-  signed char kMadd4[16] = {K[6], K[7], K[6], K[7], K[6], K[7], K[6], K[7],
-                            K[6], K[7], K[6], K[7], K[6], K[7], K[6], K[7]};
-  unsigned char bv0[16], bv1[16], bv2[16], bv3[16];
-  unsigned char in1[16], in2[16], in3[16];
-  short wv0[8], wv1[8], wv2[8], wv3[8];
+  __m128i kRound = _mm_set1_epi16(ROUND);
+  __m128i kMadd1 = _mm_set_epi8(K[1], K[0], K[1], K[0], K[1], K[0], K[1], K[0],
+                                K[1], K[0], K[1], K[0], K[1], K[0], K[1], K[0]);
+  __m128i kMadd2 = _mm_set_epi8(K[3], K[2], K[3], K[2], K[3], K[2], K[3], K[2],
+                                K[3], K[2], K[3], K[2], K[3], K[2], K[3], K[2]);
+  __m128i kMadd3 = _mm_set_epi8(K[5], K[4], K[5], K[4], K[5], K[4], K[5], K[4],
+                                K[5], K[4], K[5], K[4], K[5], K[4], K[5], K[4]);
+  __m128i kMadd4 = _mm_set_epi8(K[7], K[6], K[7], K[6], K[7], K[6], K[7], K[6],
+                                K[7], K[6], K[7], K[6], K[7], K[6], K[7], K[6]);
+  __m128i bv0, bv1, bv2, bv3;
+  __m128i in1, in2, in3;
+  __m128i wv0, wv1, wv2, wv3;
   unsigned long i, j, w;
   if (n >= STRIDE) {
     i = 0;
     w = (n + RATIO / 2) / RATIO;
-    memset(in1, A[0], sizeof(in1));
-    memset(in2, A[n - 1], 16);
-    memcpy(in2, A, MIN(16, n));
+    in1 = _mm_set1_epi8(A[0]);
+    in2 = _mm_set1_epi8(A[n - 1]);
+    _mm_storeu_si128((__m128i *)&in2, _mm_loadu_si128((__m128i *)A));
     for (; i < w; i += STRIDE) {
       j = i * RATIO + 16;
       if (j + 16 <= n) {
-        memcpy(in3, &A[j], 16);
+        in3 = _mm_loadu_si128((__m128i *)&A[j]);
       } else {
-        memset(in3, A[n - 1], 16);
+        in3 = _mm_set1_epi8(A[n - 1]);
         if (j < n) {
-          memcpy(in3, &A[j], n - j);
+          // SSSE3-compatible way to handle partial loads
+          __m128i mask = _mm_loadu_si128((__m128i *)&A[j]);
+          __m128i shuffle_mask = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7,
+                                              6, 5, 4, 3, 2, 1, 0);
+          __m128i index = _mm_set1_epi8(n - j);
+          __m128i cmp = _mm_cmplt_epi8(shuffle_mask, index);
+          in3 = _mm_or_si128(_mm_and_si128(cmp, mask),
+                             _mm_andnot_si128(cmp, in3));
         }
       }
-      palignr(bv0, in2, in1, 13);
-      palignr(bv1, in2, in1, 15);
-      palignr(bv2, in3, in2, 1);
-      palignr(bv3, in3, in2, 3);
-      pmaddubsw(wv0, bv0, kMadd1);
-      pmaddubsw(wv1, bv1, kMadd2);
-      pmaddubsw(wv2, bv2, kMadd3);
-      pmaddubsw(wv3, bv3, kMadd4);
-      paddw(wv0, wv0, kRound);
-      paddw(wv0, wv0, wv1);
-      paddw(wv0, wv0, wv2);
-      paddw(wv0, wv0, wv3);
-      psraw(wv0, wv0, SCALE);
-      packuswb(bv2, wv0, wv0);
-      memcpy(&A[i], bv2, STRIDE);
-      memcpy(in1, in2, 16);
-      memcpy(in2, in3, 16);
+      bv0 = _mm_alignr_epi8(in2, in1, 13);
+      bv1 = _mm_alignr_epi8(in2, in1, 15);
+      bv2 = _mm_alignr_epi8(in3, in2, 1);
+      bv3 = _mm_alignr_epi8(in3, in2, 3);
+      wv0 = _mm_maddubs_epi16(bv0, kMadd1);
+      wv1 = _mm_maddubs_epi16(bv1, kMadd2);
+      wv2 = _mm_maddubs_epi16(bv2, kMadd3);
+      wv3 = _mm_maddubs_epi16(bv3, kMadd4);
+      wv0 = _mm_add_epi16(wv0, kRound);
+      wv0 = _mm_add_epi16(wv0, wv1);
+      wv0 = _mm_add_epi16(wv0, wv2);
+      wv0 = _mm_add_epi16(wv0, wv3);
+      wv0 = _mm_srai_epi16(wv0, SCALE);
+      bv2 = _mm_packus_epi16(wv0, wv0);
+      _mm_storel_epi64((__m128i *)&A[i], bv2);
+      in1 = in2;
+      in2 = in3;
     }
   }
   return A;
+#else
+  long h, i;
+  if (n < 2)
+    return A;
+  unsigned char M[3 + n + 4];
+  unsigned char *q = M;
+  q[0] = A[0];
+  q[1] = A[0];
+  q[2] = A[0];
+  memcpy(q + 3, A, n);
+  q[3 + n + 0] = A[n - 1];
+  q[3 + n + 1] = A[n - 1];
+  q[3 + n + 2] = A[n - 1];
+  q[3 + n + 3] = A[n - 1];
+  q += 3;
+  h = (n + 1) >> 1;
+  for (i = 0; i < h; ++i) {
+    short x0, x1, x2, x3, x4, x5, x6, x7;
+    x0 = q[i * 2 - 3];
+    x1 = q[i * 2 - 2];
+    x2 = q[i * 2 - 1];
+    x3 = q[i * 2 + 0];
+    x4 = q[i * 2 + 1];
+    x5 = q[i * 2 + 2];
+    x6 = q[i * 2 + 3];
+    x7 = q[i * 2 + 4];
+    x0 *= K[0];
+    x1 *= K[1];
+    x2 *= K[2];
+    x3 *= K[3];
+    x4 *= K[4];
+    x5 *= K[5];
+    x6 *= K[6];
+    x7 *= K[7];
+    x0 += x1;
+    x2 += x3;
+    x4 += x5;
+    x6 += x7;
+    x0 += x2;
+    x4 += x6;
+    x0 += x4;
+    x0 += 1 << 4;
+    x0 >>= 5;
+    A[i] = MIN(255, MAX(0, x0));
+  }
+  return A;
+#endif
 }
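
The SSSE3 loop decomposes that 8-tap sum with _mm_alignr_epi8, which carves
four sliding byte windows out of three consecutive 16-byte blocks (offsets 13
and 15 into in1:in2, offsets 1 and 3 into in2:in3), so each _mm_maddubs_epi16
sees one (even, odd) coefficient pair for eight outputs at once. The
self-contained check below compares that decomposition against the plain
scalar sum for one mid-buffer block; the kernel is an assumed stand-in, and
it compiles with -mssse3.

#include <stdio.h>
#include <tmmintrin.h> /* SSSE3: _mm_alignr_epi8, _mm_maddubs_epi16 */

int main(void) {
  /* Assumed kernel; any taps whose pairwise products stay in s16 work. */
  static const signed char K[8] = {-1, -3, 3, 17, 17, 3, -3, -1};
  unsigned char x[48];
  for (int i = 0; i < 48; ++i) x[i] = (i * 37 + 11) & 255;

  /* Three consecutive blocks; the outputs correspond to base x[16]. */
  __m128i in1 = _mm_loadu_si128((__m128i *)&x[0]);
  __m128i in2 = _mm_loadu_si128((__m128i *)&x[16]);
  __m128i in3 = _mm_loadu_si128((__m128i *)&x[32]);
  __m128i kM1 = _mm_set_epi8(K[1], K[0], K[1], K[0], K[1], K[0], K[1], K[0],
                             K[1], K[0], K[1], K[0], K[1], K[0], K[1], K[0]);
  __m128i kM2 = _mm_set_epi8(K[3], K[2], K[3], K[2], K[3], K[2], K[3], K[2],
                             K[3], K[2], K[3], K[2], K[3], K[2], K[3], K[2]);
  __m128i kM3 = _mm_set_epi8(K[5], K[4], K[5], K[4], K[5], K[4], K[5], K[4],
                             K[5], K[4], K[5], K[4], K[5], K[4], K[5], K[4]);
  __m128i kM4 = _mm_set_epi8(K[7], K[6], K[7], K[6], K[7], K[6], K[7], K[6],
                             K[7], K[6], K[7], K[6], K[7], K[6], K[7], K[6]);
  __m128i bv0 = _mm_alignr_epi8(in2, in1, 13); /* word i: x[16+2i-3..2i-2] */
  __m128i bv1 = _mm_alignr_epi8(in2, in1, 15); /* word i: x[16+2i-1..2i+0] */
  __m128i bv2 = _mm_alignr_epi8(in3, in2, 1);  /* word i: x[16+2i+1..2i+2] */
  __m128i bv3 = _mm_alignr_epi8(in3, in2, 3);  /* word i: x[16+2i+3..2i+4] */
  __m128i w = _mm_add_epi16(_mm_maddubs_epi16(bv0, kM1), _mm_set1_epi16(16));
  w = _mm_add_epi16(w, _mm_maddubs_epi16(bv1, kM2));
  w = _mm_add_epi16(w, _mm_maddubs_epi16(bv2, kM3));
  w = _mm_add_epi16(w, _mm_maddubs_epi16(bv3, kM4));
  w = _mm_srai_epi16(w, 5);
  unsigned char simd[16];
  _mm_storeu_si128((__m128i *)simd, _mm_packus_epi16(w, w));

  /* Scalar reference: y[i] = (sum of K[t] * x[16 + 2i + t - 3] + 16) >> 5 */
  for (int i = 0; i < 8; ++i) {
    int acc = 16;
    for (int t = 0; t < 8; ++t) acc += K[t] * x[16 + 2 * i + t - 3];
    acc >>= 5;
    acc = acc < 0 ? 0 : acc > 255 ? 255 : acc;
    printf("y[%d] scalar=%3d simd=%3d%s\n", i, acc, simd[i],
           acc == simd[i] ? "" : " MISMATCH");
  }
  return 0;
}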
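
The "SSSE3-compatible way to handle partial loads" trick is worth isolating:
it emulates the SSE4.1 _mm_blendv_epi8 using only SSE2 compare/and/andnot/or,
selecting loaded bytes for lanes below the valid length and a fill byte
elsewhere. A sketch follows, with a hypothetical helper name, LoadTail; note
that it still issues a full 16-byte load, exactly like the code above, so the
bytes past the valid length must at least be readable.

#include <emmintrin.h> /* SSE2 suffices for the blend itself */
#include <stdio.h>

/* Load `len` (1..15) bytes from `p` into a vector, filling the remaining
   lanes with `fill`, without SSE4.1 blendv. Lanes whose index compares
   below `len` take the loaded byte; the rest keep the fill byte. */
static __m128i LoadTail(const unsigned char *p, int len, unsigned char fill) {
  __m128i v = _mm_loadu_si128((const __m128i *)p); /* full 16-byte load */
  __m128i lane = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                              7, 6, 5, 4, 3, 2, 1, 0); /* lane indices */
  __m128i keep = _mm_cmplt_epi8(lane, _mm_set1_epi8(len)); /* lane < len */
  return _mm_or_si128(_mm_and_si128(keep, v),
                      _mm_andnot_si128(keep, _mm_set1_epi8(fill)));
}

int main(void) {
  unsigned char buf[16] = {1, 2, 3, 4, 5};
  unsigned char out[16];
  _mm_storeu_si128((__m128i *)out, LoadTail(buf, 5, 0xAA));
  for (int i = 0; i < 16; ++i) printf("%02x ", out[i]);
  printf("\n"); /* 01 02 03 04 05 aa aa aa aa aa aa aa aa aa aa aa */
  return 0;
}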