/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8                                :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
│                                                                              │
│ Permission to use, copy, modify, and/or distribute this software for         │
│ any purpose with or without fee is hereby granted, provided that the         │
│ above copyright notice and this permission notice appear in all copies.      │
│                                                                              │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
│ PERFORMANCE OF THIS SOFTWARE.                                                │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/intrin/mpsadbw.h"
#include "libc/intrin/pabsb.h"
#include "libc/intrin/pabsd.h"
#include "libc/intrin/pabsw.h"
#include "libc/intrin/packssdw.h"
#include "libc/intrin/packsswb.h"
#include "libc/intrin/packusdw.h"
#include "libc/intrin/packuswb.h"
#include "libc/intrin/paddb.h"
#include "libc/intrin/paddd.h"
#include "libc/intrin/paddq.h"
#include "libc/intrin/paddsb.h"
#include "libc/intrin/paddsw.h"
#include "libc/intrin/paddusb.h"
#include "libc/intrin/paddusw.h"
#include "libc/intrin/paddw.h"
#include "libc/intrin/pand.h"
#include "libc/intrin/pandn.h"
#include "libc/intrin/pavgb.h"
#include "libc/intrin/pavgw.h"
#include "libc/intrin/pcmpeqb.h"
#include "libc/intrin/pcmpeqd.h"
#include "libc/intrin/pcmpeqw.h"
#include "libc/intrin/pcmpgtb.h"
#include "libc/intrin/pcmpgtd.h"
#include "libc/intrin/pcmpgtw.h"
#include "libc/intrin/pdep.h"
#include "libc/intrin/pext.h"
#include "libc/intrin/phaddd.h"
#include "libc/intrin/phaddsw.h"
#include "libc/intrin/phaddw.h"
#include "libc/intrin/phsubd.h"
#include "libc/intrin/phsubsw.h"
#include "libc/intrin/phsubw.h"
#include "libc/intrin/pmaddubsw.h"
#include "libc/intrin/pmaddwd.h"
#include "libc/intrin/pmaxsw.h"
#include "libc/intrin/pmaxub.h"
#include "libc/intrin/pminsw.h"
#include "libc/intrin/pminub.h"
#include "libc/intrin/pmulhrsw.h"
#include "libc/intrin/pmulhuw.h"
#include "libc/intrin/pmulhw.h"
#include "libc/intrin/pmulld.h"
#include "libc/intrin/pmullw.h"
#include "libc/intrin/pmuludq.h"
#include "libc/intrin/por.h"
#include "libc/intrin/psadbw.h"
#include "libc/intrin/pshufb.h"
#include "libc/intrin/pshufd.h"
#include "libc/intrin/pshufhw.h"
#include "libc/intrin/pshuflw.h"
#include "libc/intrin/pshufw.h"
#include "libc/intrin/psignb.h"
#include "libc/intrin/psignd.h"
#include "libc/intrin/psignw.h"
#include "libc/intrin/pslld.h"
#include "libc/intrin/pslldq.h"
#include "libc/intrin/psllq.h"
#include "libc/intrin/psllw.h"
#include "libc/intrin/psrad.h"
#include "libc/intrin/psraw.h"
#include "libc/intrin/psrld.h"
#include "libc/intrin/psrldq.h"
#include "libc/intrin/psrlq.h"
#include "libc/intrin/psrlw.h"
#include "libc/intrin/psubb.h"
#include "libc/intrin/psubd.h"
#include "libc/intrin/psubq.h"
#include "libc/intrin/psubsb.h"
#include "libc/intrin/psubsw.h"
#include "libc/intrin/psubusb.h"
#include "libc/intrin/psubusw.h"
#include "libc/intrin/psubw.h"
#include "libc/intrin/punpckhbw.h"
#include "libc/intrin/punpckhdq.h"
#include "libc/intrin/punpckhqdq.h"
#include "libc/intrin/punpckhwd.h"
#include "libc/intrin/punpcklbw.h"
#include "libc/intrin/punpckldq.h"
#include "libc/intrin/punpcklqdq.h"
#include "libc/intrin/punpcklwd.h"
#include "libc/intrin/pxor.h"
#include "libc/limits.h"
#include "libc/log/check.h"
#include "libc/nexgen32e/kcpuids.h"
#include "libc/rand/lcg.internal.h"
#include "libc/rand/rand.h"
#include "libc/runtime/gc.internal.h"
#include "libc/stdio/stdio.h"
#include "libc/str/str.h"
#include "libc/testlib/ezbench.h"
#include "libc/testlib/testlib.h"
#include "libc/x/x.h"
#include "tool/viz/lib/formatstringtable-testlib.h"

/* State word for the linear congruential generator that Rando() advances. */
uint64_t g_rando = 1;

/**
 * Returns a 64-bit pseudorandom value assembled from two generator steps.
 *
 * Each step contributes the upper 32 bits of its result. The two calls are
 * sequenced explicitly because both mutate g_rando, and C leaves the
 * evaluation order of the operands of `|` unspecified; without sequencing,
 * which step lands in the high half could vary between compilers.
 *
 * @return first step's top 32 bits in the high half, second step's top 32
 *     bits in the low half
 */
forceinline uint64_t Rando(void) {
  const uint64_t hi = KnuthLinearCongruentialGenerator(&g_rando) >> 32 << 32;
  const uint64_t lo = KnuthLinearCongruentialGenerator(&g_rando) >> 32;
  return hi | lo;
}

/**
 * Fills a buffer with 64-bit values drawn from Rando().
 *
 * Words are stored starting at the highest offset and working down to
 * offset zero, one Rando() call per eight bytes.
 *
 * @param mem is the destination buffer
 * @param size is the byte length, which is DCHECK'd to be a multiple of 8
 */
dontinline void RngSet(void *mem, size_t size) {
  char *base = mem;
  size_t word;
  DCHECK(size % 8 == 0);
  word = size / 8;
  while (word-- > 0) {
    uint64_t bits = Rando();
    memcpy(base + word * 8, &bits, sizeof(bits));
  }
}

/*
 * Fixture that zeroes the whole kCpuids table, casting away its qualifier
 * to do so.
 * NOTE(review): presumably this makes CPU feature checks report nothing so
 * the macro intrinsics take their portable code paths -- confirm against
 * the feature-detection macros, which are not visible in this file.
 */
FIXTURE(intrin, disableHardwareExtensions) {
  memset((/*unconst*/ void *)kCpuids, 0, sizeof(kCpuids));
}

/* Low four words of both operands are interleaved into a separate output. */
TEST(punpcklwd, test) {
  uint16_t lhs[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  uint16_t rhs[8] = {9, 10, 11, 12, 13, 14, 15, 16};
  uint16_t dst[8];
  punpcklwd(dst, lhs, rhs);
  ASSERT_EQ(1, dst[0]);
  ASSERT_EQ(9, dst[1]);
  ASSERT_EQ(2, dst[2]);
  ASSERT_EQ(10, dst[3]);
  ASSERT_EQ(3, dst[4]);
  ASSERT_EQ(11, dst[5]);
  ASSERT_EQ(4, dst[6]);
  ASSERT_EQ(12, dst[7]);
}

/*
 * Same expectations as the plain punpcklwd test, but the name is
 * parenthesized so that any function-like macro called punpcklwd is
 * bypassed and the function itself is invoked, as the other "pure" tests
 * in this file do.
 */
TEST(punpcklwd, pure) {
  uint16_t a[8] = {1, 02, 03, 04, 05, 06, 07, 8};
  uint16_t b[8] = {9, 10, 11, 12, 13, 14, 15, 16};
  uint16_t c[8];
  (punpcklwd)(c, a, b);
  ASSERT_EQ(1, c[0]);
  ASSERT_EQ(9, c[1]);
  ASSERT_EQ(2, c[2]);
  ASSERT_EQ(10, c[3]);
  ASSERT_EQ(3, c[4]);
  ASSERT_EQ(11, c[5]);
  ASSERT_EQ(4, c[6]);
  ASSERT_EQ(12, c[7]);
}

/* Output buffer is also the first operand. */
TEST(punpcklwd, testAlias) {
  uint16_t lhs[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  uint16_t rhs[8] = {9, 10, 11, 12, 13, 14, 15, 16};
  punpcklwd(lhs, lhs, rhs);
  ASSERT_EQ(1, lhs[0]);
  ASSERT_EQ(9, lhs[1]);
  ASSERT_EQ(2, lhs[2]);
  ASSERT_EQ(10, lhs[3]);
  ASSERT_EQ(3, lhs[4]);
  ASSERT_EQ(11, lhs[5]);
  ASSERT_EQ(4, lhs[6]);
  ASSERT_EQ(12, lhs[7]);
}

/* Parenthesized call (no macro expansion); output aliases first operand. */
TEST(punpcklwd, pureAlias) {
  uint16_t lhs[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  uint16_t rhs[8] = {9, 10, 11, 12, 13, 14, 15, 16};
  (punpcklwd)(lhs, lhs, rhs);
  ASSERT_EQ(1, lhs[0]);
  ASSERT_EQ(9, lhs[1]);
  ASSERT_EQ(2, lhs[2]);
  ASSERT_EQ(10, lhs[3]);
  ASSERT_EQ(3, lhs[4]);
  ASSERT_EQ(11, lhs[5]);
  ASSERT_EQ(4, lhs[6]);
  ASSERT_EQ(12, lhs[7]);
}

/* Output buffer is also the second operand. */
TEST(punpcklwd, testAlias2) {
  uint16_t lhs[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  uint16_t rhs[8] = {9, 10, 11, 12, 13, 14, 15, 16};
  punpcklwd(rhs, lhs, rhs);
  ASSERT_EQ(1, rhs[0]);
  ASSERT_EQ(9, rhs[1]);
  ASSERT_EQ(2, rhs[2]);
  ASSERT_EQ(10, rhs[3]);
  ASSERT_EQ(3, rhs[4]);
  ASSERT_EQ(11, rhs[5]);
  ASSERT_EQ(4, rhs[6]);
  ASSERT_EQ(12, rhs[7]);
}

/* Parenthesized call (no macro expansion); output aliases second operand. */
TEST(punpcklwd, pureAlias2) {
  uint16_t lhs[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  uint16_t rhs[8] = {9, 10, 11, 12, 13, 14, 15, 16};
  (punpcklwd)(rhs, lhs, rhs);
  ASSERT_EQ(1, rhs[0]);
  ASSERT_EQ(9, rhs[1]);
  ASSERT_EQ(2, rhs[2]);
  ASSERT_EQ(10, rhs[3]);
  ASSERT_EQ(3, rhs[4]);
  ASSERT_EQ(11, rhs[5]);
  ASSERT_EQ(4, rhs[6]);
  ASSERT_EQ(12, rhs[7]);
}

/* Result holds the low quadword of each operand, first operand first. */
TEST(punpcklqdq, test) {
  uint64_t lhs[2] = {1, 2};
  uint64_t rhs[2] = {3, 4};
  uint64_t dst[2];
  punpcklqdq(dst, lhs, rhs);
  ASSERT_EQ(1, dst[0]);
  ASSERT_EQ(3, dst[1]);
}

/* Same check through the parenthesized (macro-bypassing) call. */
TEST(punpcklqdq, pure) {
  uint64_t lhs[2] = {1, 2};
  uint64_t rhs[2] = {3, 4};
  uint64_t dst[2];
  (punpcklqdq)(dst, lhs, rhs);
  ASSERT_EQ(1, dst[0]);
  ASSERT_EQ(3, dst[1]);
}

/* Output buffer is also the first operand. */
TEST(punpcklqdq, testAlias) {
  uint64_t lhs[2] = {1, 2};
  uint64_t rhs[2] = {3, 4};
  punpcklqdq(lhs, lhs, rhs);
  ASSERT_EQ(1, lhs[0]);
  ASSERT_EQ(3, lhs[1]);
}

/* Parenthesized call (no macro expansion); output aliases first operand. */
TEST(punpcklqdq, pureAlias) {
  uint64_t lhs[2] = {1, 2};
  uint64_t rhs[2] = {3, 4};
  (punpcklqdq)(lhs, lhs, rhs);
  ASSERT_EQ(1, lhs[0]);
  ASSERT_EQ(3, lhs[1]);
}

/* Low two doublewords of both operands are interleaved into the output. */
TEST(punpckldq, test) {
  uint32_t lhs[4] = {1, 2, 3, 4};
  uint32_t rhs[4] = {5, 6, 7, 8};
  uint32_t dst[4];
  punpckldq(dst, lhs, rhs);
  ASSERT_EQ(1, dst[0]);
  ASSERT_EQ(5, dst[1]);
  ASSERT_EQ(2, dst[2]);
  ASSERT_EQ(6, dst[3]);
}

/*
 * Same expectations as the plain punpckldq test, but the name is
 * parenthesized so that any function-like macro called punpckldq is
 * bypassed and the function itself is invoked, as the other "pure" tests
 * in this file do.
 */
TEST(punpckldq, pure) {
  uint32_t a[4] = {1, 2, 3, 4};
  uint32_t b[4] = {5, 6, 7, 8};
  uint32_t c[4];
  (punpckldq)(c, a, b);
  ASSERT_EQ(1, c[0]);
  ASSERT_EQ(5, c[1]);
  ASSERT_EQ(2, c[2]);
  ASSERT_EQ(6, c[3]);
}

/* Output buffer is also the first operand. */
TEST(punpckldq, testAlias) {
  uint32_t lhs[4] = {1, 2, 3, 4};
  uint32_t rhs[4] = {5, 6, 7, 8};
  punpckldq(lhs, lhs, rhs);
  ASSERT_EQ(1, lhs[0]);
  ASSERT_EQ(5, lhs[1]);
  ASSERT_EQ(2, lhs[2]);
  ASSERT_EQ(6, lhs[3]);
}

/* Parenthesized call (no macro expansion); output aliases first operand. */
TEST(punpckldq, pureAlias) {
  uint32_t lhs[4] = {1, 2, 3, 4};
  uint32_t rhs[4] = {5, 6, 7, 8};
  (punpckldq)(lhs, lhs, rhs);
  ASSERT_EQ(1, lhs[0]);
  ASSERT_EQ(5, lhs[1]);
  ASSERT_EQ(2, lhs[2]);
  ASSERT_EQ(6, lhs[3]);
}

/* Output buffer is also the second operand. */
TEST(punpckldq, testAlias2) {
  uint32_t lhs[4] = {1, 2, 3, 4};
  uint32_t rhs[4] = {5, 6, 7, 8};
  punpckldq(rhs, lhs, rhs);
  ASSERT_EQ(1, rhs[0]);
  ASSERT_EQ(5, rhs[1]);
  ASSERT_EQ(2, rhs[2]);
  ASSERT_EQ(6, rhs[3]);
}

/* Parenthesized call (no macro expansion); output aliases second operand. */
TEST(punpckldq, pureAlias2) {
  uint32_t lhs[4] = {1, 2, 3, 4};
  uint32_t rhs[4] = {5, 6, 7, 8};
  (punpckldq)(rhs, lhs, rhs);
  ASSERT_EQ(1, rhs[0]);
  ASSERT_EQ(5, rhs[1]);
  ASSERT_EQ(2, rhs[2]);
  ASSERT_EQ(6, rhs[3]);
}

/*
 * Checks on random inputs that the plain call and the parenthesized call
 * (which bypasses any function-like macro of the same name) agree.
 */
TEST(punpcklqdq, fuzz) {
  int i;
  uint64_t x[2], y[2], a[2], b[2];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    punpcklqdq(a, x, y);
    (punpcklqdq)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain call and the parenthesized call
 * (which bypasses any function-like macro of the same name) agree.
 */
TEST(punpckldq, fuzz) {
  int i;
  uint32_t x[4], y[4], a[4], b[4];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    punpckldq(a, x, y);
    (punpckldq)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain call and the parenthesized call
 * (which bypasses any function-like macro of the same name) agree.
 */
TEST(punpcklbw, fuzz) {
  int i;
  uint8_t x[16], y[16], a[16], b[16];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    punpcklbw(a, x, y);
    (punpcklbw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/* High four words of both operands are interleaved into a separate output. */
TEST(punpckhwd, test) {
  uint16_t lhs[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  uint16_t rhs[8] = {9, 10, 11, 12, 13, 14, 15, 16};
  uint16_t dst[8];
  punpckhwd(dst, lhs, rhs);
  EXPECT_EQ(5, dst[0]);
  EXPECT_EQ(13, dst[1]);
  EXPECT_EQ(6, dst[2]);
  EXPECT_EQ(14, dst[3]);
  EXPECT_EQ(7, dst[4]);
  EXPECT_EQ(15, dst[5]);
  EXPECT_EQ(8, dst[6]);
  EXPECT_EQ(16, dst[7]);
}

/*
 * Same expectations as the plain punpckhwd test, but the name is
 * parenthesized so that any function-like macro called punpckhwd is
 * bypassed and the function itself is invoked, as the other "pure" tests
 * in this file do.
 */
TEST(punpckhwd, pure) {
  uint16_t a[8] = {1, 02, 03, 04, 05, 06, 07, 8};
  uint16_t b[8] = {9, 10, 11, 12, 13, 14, 15, 16};
  uint16_t c[8];
  (punpckhwd)(c, a, b);
  EXPECT_EQ(5, c[0]);
  EXPECT_EQ(13, c[1]);
  EXPECT_EQ(6, c[2]);
  EXPECT_EQ(14, c[3]);
  EXPECT_EQ(7, c[4]);
  EXPECT_EQ(15, c[5]);
  EXPECT_EQ(8, c[6]);
  EXPECT_EQ(16, c[7]);
}

/* Output buffer is also the first operand. */
TEST(punpckhwd, testAlias) {
  uint16_t lhs[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  uint16_t rhs[8] = {9, 10, 11, 12, 13, 14, 15, 16};
  punpckhwd(lhs, lhs, rhs);
  EXPECT_EQ(5, lhs[0]);
  EXPECT_EQ(13, lhs[1]);
  EXPECT_EQ(6, lhs[2]);
  EXPECT_EQ(14, lhs[3]);
  EXPECT_EQ(7, lhs[4]);
  EXPECT_EQ(15, lhs[5]);
  EXPECT_EQ(8, lhs[6]);
  EXPECT_EQ(16, lhs[7]);
}

/* Parenthesized call (no macro expansion); output aliases first operand. */
TEST(punpckhwd, pureAlias) {
  uint16_t lhs[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  uint16_t rhs[8] = {9, 10, 11, 12, 13, 14, 15, 16};
  (punpckhwd)(lhs, lhs, rhs);
  EXPECT_EQ(5, lhs[0]);
  EXPECT_EQ(13, lhs[1]);
  EXPECT_EQ(6, lhs[2]);
  EXPECT_EQ(14, lhs[3]);
  EXPECT_EQ(7, lhs[4]);
  EXPECT_EQ(15, lhs[5]);
  EXPECT_EQ(8, lhs[6]);
  EXPECT_EQ(16, lhs[7]);
}

/* Output buffer is also the second operand. */
TEST(punpckhwd, testAlias2) {
  uint16_t lhs[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  uint16_t rhs[8] = {9, 10, 11, 12, 13, 14, 15, 16};
  punpckhwd(rhs, lhs, rhs);
  EXPECT_EQ(5, rhs[0]);
  EXPECT_EQ(13, rhs[1]);
  EXPECT_EQ(6, rhs[2]);
  EXPECT_EQ(14, rhs[3]);
  EXPECT_EQ(7, rhs[4]);
  EXPECT_EQ(15, rhs[5]);
  EXPECT_EQ(8, rhs[6]);
  EXPECT_EQ(16, rhs[7]);
}

/* Parenthesized call (no macro expansion); output aliases second operand. */
TEST(punpckhwd, pureAlias2) {
  uint16_t lhs[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  uint16_t rhs[8] = {9, 10, 11, 12, 13, 14, 15, 16};
  (punpckhwd)(rhs, lhs, rhs);
  EXPECT_EQ(5, rhs[0]);
  EXPECT_EQ(13, rhs[1]);
  EXPECT_EQ(6, rhs[2]);
  EXPECT_EQ(14, rhs[3]);
  EXPECT_EQ(7, rhs[4]);
  EXPECT_EQ(15, rhs[5]);
  EXPECT_EQ(8, rhs[6]);
  EXPECT_EQ(16, rhs[7]);
}

/* Result holds the high quadword of each operand, first operand first. */
TEST(punpckhqdq, test) {
  uint64_t lhs[2] = {1, 2};
  uint64_t rhs[2] = {3, 4};
  uint64_t dst[2];
  punpckhqdq(dst, lhs, rhs);
  EXPECT_EQ(2, dst[0]);
  EXPECT_EQ(4, dst[1]);
}

/* Same check through the parenthesized (macro-bypassing) call. */
TEST(punpckhqdq, pure) {
  uint64_t lhs[2] = {1, 2};
  uint64_t rhs[2] = {3, 4};
  uint64_t dst[2];
  (punpckhqdq)(dst, lhs, rhs);
  EXPECT_EQ(2, dst[0]);
  EXPECT_EQ(4, dst[1]);
}

/* Output buffer is also the first operand. */
TEST(punpckhqdq, testAlias) {
  uint64_t lhs[2] = {1, 2};
  uint64_t rhs[2] = {3, 4};
  punpckhqdq(lhs, lhs, rhs);
  EXPECT_EQ(2, lhs[0]);
  EXPECT_EQ(4, lhs[1]);
}

/* Parenthesized call (no macro expansion); output aliases first operand. */
TEST(punpckhqdq, pureAlias) {
  uint64_t lhs[2] = {1, 2};
  uint64_t rhs[2] = {3, 4};
  (punpckhqdq)(lhs, lhs, rhs);
  EXPECT_EQ(2, lhs[0]);
  EXPECT_EQ(4, lhs[1]);
}

/* High two doublewords of both operands are interleaved into the output. */
TEST(punpckhdq, test) {
  uint32_t lhs[4] = {1, 2, 3, 4};
  uint32_t rhs[4] = {5, 6, 7, 8};
  uint32_t dst[4];
  punpckhdq(dst, lhs, rhs);
  EXPECT_EQ(3, dst[0]);
  EXPECT_EQ(7, dst[1]);
  EXPECT_EQ(4, dst[2]);
  EXPECT_EQ(8, dst[3]);
}

/*
 * Same expectations as the plain punpckhdq test, but the name is
 * parenthesized so that any function-like macro called punpckhdq is
 * bypassed and the function itself is invoked, as the other "pure" tests
 * in this file do.
 */
TEST(punpckhdq, pure) {
  uint32_t a[4] = {1, 2, 3, 4};
  uint32_t b[4] = {5, 6, 7, 8};
  uint32_t c[4];
  (punpckhdq)(c, a, b);
  EXPECT_EQ(3, c[0]);
  EXPECT_EQ(7, c[1]);
  EXPECT_EQ(4, c[2]);
  EXPECT_EQ(8, c[3]);
}

/* Output buffer is also the first operand. */
TEST(punpckhdq, testAlias) {
  uint32_t lhs[4] = {1, 2, 3, 4};
  uint32_t rhs[4] = {5, 6, 7, 8};
  punpckhdq(lhs, lhs, rhs);
  EXPECT_EQ(3, lhs[0]);
  EXPECT_EQ(7, lhs[1]);
  EXPECT_EQ(4, lhs[2]);
  EXPECT_EQ(8, lhs[3]);
}

/* Parenthesized call (no macro expansion); output aliases first operand. */
TEST(punpckhdq, pureAlias) {
  uint32_t lhs[4] = {1, 2, 3, 4};
  uint32_t rhs[4] = {5, 6, 7, 8};
  (punpckhdq)(lhs, lhs, rhs);
  EXPECT_EQ(3, lhs[0]);
  EXPECT_EQ(7, lhs[1]);
  EXPECT_EQ(4, lhs[2]);
  EXPECT_EQ(8, lhs[3]);
}

/* Output buffer is also the second operand. */
TEST(punpckhdq, testAlias2) {
  uint32_t lhs[4] = {1, 2, 3, 4};
  uint32_t rhs[4] = {5, 6, 7, 8};
  punpckhdq(rhs, lhs, rhs);
  EXPECT_EQ(3, rhs[0]);
  EXPECT_EQ(7, rhs[1]);
  EXPECT_EQ(4, rhs[2]);
  EXPECT_EQ(8, rhs[3]);
}

/* Parenthesized call (no macro expansion); output aliases second operand. */
TEST(punpckhdq, pureAlias2) {
  uint32_t lhs[4] = {1, 2, 3, 4};
  uint32_t rhs[4] = {5, 6, 7, 8};
  (punpckhdq)(rhs, lhs, rhs);
  EXPECT_EQ(3, rhs[0]);
  EXPECT_EQ(7, rhs[1]);
  EXPECT_EQ(4, rhs[2]);
  EXPECT_EQ(8, rhs[3]);
}

/*
 * Checks on random inputs that the plain call and the parenthesized call
 * (which bypasses any function-like macro of the same name) agree.
 */
TEST(punpckhwd, fuzz) {
  int i;
  uint16_t x[8], y[8], a[8], b[8];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    punpckhwd(a, x, y);
    (punpckhwd)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain call and the parenthesized call
 * (which bypasses any function-like macro of the same name) agree.
 */
TEST(punpckhqdq, fuzz) {
  int i;
  uint64_t x[2], y[2], a[2], b[2];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    punpckhqdq(a, x, y);
    (punpckhqdq)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain call and the parenthesized call
 * (which bypasses any function-like macro of the same name) agree.
 */
TEST(punpckhdq, fuzz) {
  int i;
  uint32_t x[4], y[4], a[4], b[4];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    punpckhdq(a, x, y);
    (punpckhdq)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain call and the parenthesized call
 * (which bypasses any function-like macro of the same name) agree.
 */
TEST(punpckhbw, fuzz) {
  int i;
  uint8_t x[16], y[16], a[16], b[16];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    punpckhbw(a, x, y);
    (punpckhbw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain call and the parenthesized call
 * (which bypasses any function-like macro of the same name) agree.
 */
TEST(psubq, fuzz) {
  int i;
  uint64_t x[2], y[2], a[2], b[2];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    psubq(a, x, y);
    (psubq)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/* Shift count of 1 supplied through a vector; sign bits are preserved. */
TEST(psrawv, testSmallShift) {
  int16_t v[8] = {-1, -2, SHRT_MIN, 2};
  uint64_t cnt[2] = {1};
  psrawv(v, v, cnt);
  ASSERT_EQ(-1, v[0]);
  ASSERT_EQ(-1, v[1]);
  ASSERT_EQ(-16384, v[2]);
  ASSERT_EQ(1, v[3]);
  ASSERT_EQ(0, v[4]);
}

/* In-place shift right by 1; negative words keep their sign. */
TEST(psraw, testSmallShift) {
  int16_t v[8] = {-1, -2, SHRT_MIN, 2};
  psraw(v, v, 1);
  ASSERT_EQ(-1, v[0]);
  ASSERT_EQ(-1, v[1]);
  ASSERT_EQ(-16384, v[2]);
  ASSERT_EQ(1, v[3]);
  ASSERT_EQ(0, v[4]);
}

/* Same small-shift check through the parenthesized (macro-bypassing) call. */
TEST(psraw, pureSmallShift) {
  int16_t v[8] = {-1, -2, SHRT_MIN, 2};
  (psraw)(v, v, 1);
  ASSERT_EQ(-1, v[0]);
  ASSERT_EQ(-1, v[1]);
  ASSERT_EQ(-16384, v[2]);
  ASSERT_EQ(1, v[3]);
  ASSERT_EQ(0, v[4]);
}

/* A count of 77 leaves negatives as -1 and non-negatives as 0. */
TEST(psraw, testBigShift_saturatesCount) {
  int16_t v[8] = {-1, -2, SHRT_MIN, 2};
  psraw(v, v, 77);
  ASSERT_EQ(-1, v[0]);
  ASSERT_EQ(-1, v[1]);
  ASSERT_EQ(-1, v[2]);
  ASSERT_EQ(0, v[3]);
  ASSERT_EQ(0, v[4]);
}

/* Oversized-count check through the parenthesized (macro-bypassing) call. */
TEST(psraw, pureBigShift_saturatesCount) {
  int16_t v[8] = {-1, -2, SHRT_MIN, 2};
  (psraw)(v, v, 77);
  ASSERT_EQ(-1, v[0]);
  ASSERT_EQ(-1, v[1]);
  ASSERT_EQ(-1, v[2]);
  ASSERT_EQ(0, v[3]);
  ASSERT_EQ(0, v[4]);
}

/* Shift count of 1 supplied through a vector; sign bits are preserved. */
TEST(psradv, testSmallShift) {
  int32_t v[8] = {-1, -2, INT32_MIN, 2};
  uint64_t cnt[2] = {1};
  psradv(v, v, cnt);
  ASSERT_EQ(-1, v[0]);
  ASSERT_EQ(-1, v[1]);
  ASSERT_EQ(-1073741824, v[2]);
  ASSERT_EQ(1, v[3]);
  ASSERT_EQ(0, v[4]);
}

/*
 * Compares the plain and parenthesized psradv calls on random data with
 * random shift counts below 70.
 */
TEST(psradv, test) {
  int round, k;
  int32_t src[4], got[4], ref[4];
  uint64_t cnt[2];
  for (round = 0; round < 100; ++round) {
    RngSet(src, sizeof(src));
    for (k = 0; k < 2; ++k) {
      cnt[k] = Rando() % 70;
    }
    psradv(got, src, cnt);
    (psradv)(ref, src, cnt);
    ASSERT_EQ(0, memcmp(got, ref, 16));
  }
}

/* In-place shift right by 1; negative doublewords keep their sign. */
TEST(psrad, testSmallShift) {
  int32_t v[4] = {-1, -2, INT32_MIN, 2};
  psrad(v, v, 1);
  ASSERT_EQ(-1, v[0]);
  ASSERT_EQ(-1, v[1]);
  ASSERT_EQ(-1073741824, v[2]);
  ASSERT_EQ(1, v[3]);
}

/* Same small-shift check through the parenthesized (macro-bypassing) call. */
TEST(psrad, pureSmallShift) {
  int32_t v[4] = {-1, -2, INT32_MIN, 2};
  (psrad)(v, v, 1);
  ASSERT_EQ(-1, v[0]);
  ASSERT_EQ(-1, v[1]);
  ASSERT_EQ(-1073741824, v[2]);
  ASSERT_EQ(1, v[3]);
}

/* A count of 77 leaves negatives as -1 and non-negatives as 0. */
TEST(psrad, testBigShift_saturatesCount) {
  int32_t v[4] = {-1, -2, INT32_MIN, 2};
  psrad(v, v, 77);
  ASSERT_EQ(-1, v[0]);
  ASSERT_EQ(-1, v[1]);
  ASSERT_EQ(-1, v[2]);
  ASSERT_EQ(0, v[3]);
}

/* Oversized-count check through the parenthesized (macro-bypassing) call. */
TEST(psrad, pureBigShift_saturatesCount) {
  int32_t v[4] = {-1, -2, INT32_MIN, 2};
  (psrad)(v, v, 77);
  ASSERT_EQ(-1, v[0]);
  ASSERT_EQ(-1, v[1]);
  ASSERT_EQ(-1, v[2]);
  ASSERT_EQ(0, v[3]);
}

/*
 * Compares the plain and parenthesized psllwv calls on random data with
 * random shift counts below 300.
 */
TEST(psllwv, test) {
  int round, k;
  uint16_t src[8], got[8], ref[8];
  uint64_t cnt[2];
  for (round = 0; round < 32; ++round) {
    RngSet(src, sizeof(src));
    for (k = 0; k < 2; ++k) {
      cnt[k] = Rando() % 300;
    }
    psllwv(got, src, cnt);
    (psllwv)(ref, src, cnt);
    ASSERT_EQ(0, memcmp(got, ref, 16));
  }
}

/* In-place left shift by 1; the top bit of 0xffff is discarded. */
TEST(psllw, testSmallShift) {
  uint16_t v[8] = {0, 1, 0xffff, 2};
  psllw(v, v, 1);
  ASSERT_EQ(0, v[0]);
  ASSERT_EQ(2, v[1]);
  ASSERT_EQ(0xfffe, v[2]);
  ASSERT_EQ(4, v[3]);
}

/* Left shift by 1 with the count supplied through a vector. */
TEST(psllwv, testSmallShift) {
  uint16_t v[8] = {0, 1, 0xffff, 2};
  uint64_t cnt[2] = {1};
  psllwv(v, v, cnt);
  ASSERT_EQ(0, v[0]);
  ASSERT_EQ(2, v[1]);
  ASSERT_EQ(0xfffe, v[2]);
  ASSERT_EQ(4, v[3]);
}

/*
 * Compares the plain and parenthesized pslldv calls on random data with
 * random shift counts below 300.
 */
TEST(pslldv, test) {
  int round, k;
  uint32_t src[4], got[4], ref[4];
  uint64_t cnt[2];
  for (round = 0; round < 32; ++round) {
    RngSet(src, sizeof(src));
    for (k = 0; k < 2; ++k) {
      cnt[k] = Rando() % 300;
    }
    pslldv(got, src, cnt);
    (pslldv)(ref, src, cnt);
    ASSERT_EQ(0, memcmp(got, ref, 16));
  }
}

/* In-place left shift by 1; the top bit of 0xffffffff is discarded. */
TEST(pslld, testSmallShift) {
  uint32_t v[8] = {0, 1, 0xffffffff, 2};
  pslld(v, v, 1);
  ASSERT_EQ(0, v[0]);
  ASSERT_EQ(2, v[1]);
  ASSERT_EQ(0xfffffffe, v[2]);
  ASSERT_EQ(4, v[3]);
}

/* Left shift by 1 with the count supplied through a vector. */
TEST(pslldv, testSmallShift) {
  uint32_t v[8] = {0, 1, 0xffffffff, 2};
  uint64_t cnt[2] = {1};
  pslldv(v, v, cnt);
  ASSERT_EQ(0, v[0]);
  ASSERT_EQ(2, v[1]);
  ASSERT_EQ(0xfffffffe, v[2]);
  ASSERT_EQ(4, v[3]);
}

/*
 * Expects the upper 16 bits of each unsigned 16x16 product; the trailing
 * comment on each assertion is the lower half of that product.
 */
TEST(pmulhuw, test) {
  uint16_t lhs[8] = {0, 0xffff, 0x0000, 0x0001, 0x8000};
  uint16_t rhs[8] = {0, 0xffff, 0xffff, 0xffff, 0x8000};
  uint16_t hi[8];
  pmulhuw(hi, lhs, rhs);
  ASSERT_EQ(0x0000 /*0000*/, hi[0]);
  ASSERT_EQ(0xfffe /*0001*/, hi[1]);
  ASSERT_EQ(0x0000 /*0000*/, hi[2]);
  ASSERT_EQ(0x0000 /*ffff*/, hi[3]);
  ASSERT_EQ(0x4000 /*0000*/, hi[4]);
}

/*
 * Same high-half expectations through the parenthesized call, which
 * bypasses any function-like macro named pmulhuw.
 */
TEST(pmulhuw, pure) {
  uint16_t lhs[8] = {0, 0xffff, 0x0000, 0x0001, 0x8000};
  uint16_t rhs[8] = {0, 0xffff, 0xffff, 0xffff, 0x8000};
  uint16_t hi[8];
  (pmulhuw)(hi, lhs, rhs);
  ASSERT_EQ(0x0000 /*0000*/, hi[0]);
  ASSERT_EQ(0xfffe /*0001*/, hi[1]);
  ASSERT_EQ(0x0000 /*0000*/, hi[2]);
  ASSERT_EQ(0x0000 /*ffff*/, hi[3]);
  ASSERT_EQ(0x4000 /*0000*/, hi[4]);
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree,
 * then repeats with the destination aliasing the first source.
 */
TEST(pmulhuw, fuzz) {
  int i;
  uint16_t x[8], y[8], a[8], b[8];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    pmulhuw(a, x, y);
    (pmulhuw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    pmulhuw(a, (void *)a, y);
    (pmulhuw)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree,
 * then repeats with the destination aliasing the first source.
 */
TEST(pmulhw, fuzz) {
  int i;
  int16_t x[8], y[8], a[8], b[8];
  for (i = 0; i < 1000; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    pmulhw(a, x, y);
    (pmulhw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    pmulhw(a, (void *)a, y);
    (pmulhw)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree,
 * then repeats with the destination aliasing the first source.
 */
TEST(pmullw, fuzz) {
  int i;
  int16_t x[8], y[8], a[8], b[8];
  for (i = 0; i < 1000; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    pmullw(a, x, y);
    (pmullw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    pmullw(a, (void *)a, y);
    (pmullw)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree,
 * then repeats with the destination aliasing the first source.
 */
TEST(pmulld, fuzz) {
  int i;
  uint32_t x[4], y[4], a[4], b[4];
  for (i = 0; i < 1000; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    pmulld(a, x, y);
    (pmulld)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    pmulld(a, (void *)a, y);
    (pmulld)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree,
 * then repeats with the 64-bit destination reinterpreted as the first
 * source.
 */
TEST(pmuludq, fuzz) {
  int i;
  uint32_t x[4], y[4];
  uint64_t a[2], b[2];
  for (i = 0; i < 1000; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    pmuludq(a, x, y);
    (pmuludq)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    pmuludq(a, (void *)a, y);
    (pmuludq)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree,
 * then repeats with the 32-bit destination reinterpreted as the first
 * source.
 */
TEST(pmaddwd, fuzz) {
  int i;
  int16_t x[8], y[8];
  int32_t a[4], b[4];
  for (i = 0; i < 1000; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    pmaddwd(a, x, y);
    (pmaddwd)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    pmaddwd(a, (void *)a, y);
    (pmaddwd)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree,
 * then repeats with the destination aliasing the first source.
 */
TEST(phaddw, fuzz) {
  int i;
  int16_t x[8], y[8];
  int16_t a[8], b[8];
  for (i = 0; i < 1000; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    phaddw(a, x, y);
    (phaddw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    phaddw(a, (void *)a, y);
    (phaddw)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree,
 * then repeats with the destination aliasing the first source.
 */
TEST(phaddd, fuzz) {
  int i;
  uint32_t x[4], y[4];
  uint32_t a[4], b[4];
  for (i = 0; i < 1000; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    phaddd(a, x, y);
    (phaddd)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    phaddd(a, (void *)a, y);
    (phaddd)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree,
 * then repeats with the destination aliasing the first source.
 */
TEST(phsubw, fuzz) {
  int i;
  int16_t x[8], y[8];
  int16_t a[8], b[8];
  for (i = 0; i < 1000; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    phsubw(a, x, y);
    (phsubw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    phsubw(a, (void *)a, y);
    (phsubw)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree,
 * then repeats with the destination aliasing the first source.
 */
TEST(phsubd, fuzz) {
  int i;
  uint32_t x[4], y[4];
  uint32_t a[4], b[4];
  for (i = 0; i < 1000; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    phsubd(a, x, y);
    (phsubd)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    phsubd(a, (void *)a, y);
    (phsubd)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree,
 * then repeats with the destination aliasing the first source.
 */
TEST(psubd, fuzz) {
  int i;
  uint32_t x[4], y[4];
  uint32_t a[4], b[4];
  for (i = 0; i < 1000; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    psubd(a, x, y);
    (psubd)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    psubd(a, (void *)a, y);
    (psubd)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree,
 * then repeats with the destination aliasing the first source.
 */
TEST(phaddsw, fuzz) {
  int i;
  int16_t x[8], y[8];
  int16_t a[8], b[8];
  for (i = 0; i < 1000; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    phaddsw(a, x, y);
    (phaddsw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    phaddsw(a, (void *)a, y);
    (phaddsw)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree,
 * then repeats with the destination aliasing the first source.
 */
TEST(phsubsw, fuzz) {
  int i;
  int16_t x[8], y[8];
  int16_t a[8], b[8];
  for (i = 0; i < 1000; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    phsubsw(a, x, y);
    (phsubsw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    phsubsw(a, (void *)a, y);
    (phsubsw)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Row 0 is overwritten with the horizontal sums of rows 0 and 1; the
 * expected matrix shows 0x7fff + 1 wrapping around to -32768.
 */
TEST(phaddw, testOverflow_wrapsAround) {
  short rows[2][8] = {
      {32767, 0, 32767, 1, 13004, -30425, 20777, -16389},
      {-28040, 13318, -1336, -24798, -13876, 3599, -7346, -23575},
  };
  phaddw(rows[0], rows[0], rows[1]);
  EXPECT_SHRTMATRIXEQ(2, 8, rows, "\n\
 32767 -32768 -17421   4388 -14722 -26134 -10277 -30921\n\
-28040  13318  -1336 -24798 -13876   3599  -7346 -23575");
}

/* All three arguments refer to the same row. */
TEST(phaddw, testAliasing_isOk) {
  short rows[1][8] = {{0, 1, 2, 3, 4, 5, 6, 7}};
  phaddw(rows[0], rows[0], rows[0]);
  EXPECT_SHRTMATRIXEQ(1, 8, rows, "\n\
 1  5  9 13  1  5  9 13");
}

/*
 * Row 0 is overwritten with the horizontal sums of rows 0 and 1; the
 * expected matrix shows sums beyond 0x7fff pinned at 32767.
 */
TEST(phaddsw, testOverflow_saturates) {
  short rows[2][8] = {
      {32767, 0, 32767, 1, 32767, 32767, 20777, -16389},
      {-28040, 13318, -1336, -24798, -13876, 3599, -7346, -23575},
  };
  phaddsw(rows[0], rows[0], rows[1]);
  EXPECT_SHRTMATRIXEQ(2, 8, rows, "\n\
 32767  32767  32767   4388 -14722 -26134 -10277 -30921\n\
-28040  13318  -1336 -24798 -13876   3599  -7346 -23575");
}

/* All three arguments refer to the same row. */
TEST(phaddsw, testAliasing_isOk) {
  short rows[1][8] = {
      {0, 1, 2, 3, 4, 5, 6, 7},
  };
  phaddsw(rows[0], rows[0], rows[0]);
  EXPECT_SHRTMATRIXEQ(1, 8, rows, "\n\
 1  5  9 13  1  5  9 13");
}

/*
 * Checks on random inputs that the plain call and the parenthesized call
 * (which bypasses any function-like macro of the same name) agree.
 */
TEST(pcmpgtb, test) {
  int i;
  int8_t x[16], y[16], a[16], b[16];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    pcmpgtb(a, x, y);
    (pcmpgtb)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain call and the parenthesized call
 * (which bypasses any function-like macro of the same name) agree.
 */
TEST(pcmpeqb, test) {
  int i;
  uint8_t x[16], y[16], a[16], b[16];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    pcmpeqb(a, x, y);
    (pcmpeqb)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain call and the parenthesized call
 * (which bypasses any function-like macro of the same name) agree.
 */
TEST(pcmpeqd, test) {
  int i;
  int32_t x[4], y[4], a[4], b[4];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    pcmpeqd(a, x, y);
    (pcmpeqd)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain call and the parenthesized call
 * (which bypasses any function-like macro of the same name) agree.
 */
TEST(pcmpgtd, test) {
  int i;
  int32_t x[4], y[4], a[4], b[4];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    pcmpgtd(a, x, y);
    (pcmpgtd)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain call and the parenthesized call
 * (which bypasses any function-like macro of the same name) agree.
 */
TEST(pcmpeqw, test) {
  int i;
  int16_t x[8], y[8], a[8], b[8];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    pcmpeqw(a, x, y);
    (pcmpeqw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain call and the parenthesized call
 * (which bypasses any function-like macro of the same name) agree.
 */
TEST(pcmpgtw, test) {
  int i;
  int16_t x[8], y[8], a[8], b[8];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    pcmpgtw(a, x, y);
    (pcmpgtw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Compares the plain and parenthesized por calls on random quadwords,
 * first with a separate destination and then with the destination also
 * serving as the first source.
 */
TEST(por, fuzz) {
  int round, k;
  uint64_t lhs[2], rhs[2], got[2], ref[2];
  for (round = 0; round < 100; ++round) {
    for (k = 0; k < 2; ++k) {
      lhs[k] = Rando();
    }
    for (k = 0; k < 2; ++k) {
      rhs[k] = Rando();
    }
    por(got, lhs, rhs);
    (por)(ref, lhs, rhs);
    ASSERT_EQ(0, memcmp(got, ref, 16));
    por(got, (void *)got, rhs);
    (por)(ref, (void *)ref, rhs);
    ASSERT_EQ(0, memcmp(got, ref, 16));
  }
}

/*
 * Compares the plain and parenthesized pxor calls on random quadwords,
 * first with a separate destination and then with the destination also
 * serving as the first source.
 */
TEST(pxor, fuzz) {
  int round, k;
  uint64_t lhs[2], rhs[2], got[2], ref[2];
  for (round = 0; round < 100; ++round) {
    for (k = 0; k < 2; ++k) {
      lhs[k] = Rando();
    }
    for (k = 0; k < 2; ++k) {
      rhs[k] = Rando();
    }
    pxor(got, lhs, rhs);
    (pxor)(ref, lhs, rhs);
    ASSERT_EQ(0, memcmp(got, ref, 16));
    pxor(got, (void *)got, rhs);
    (pxor)(ref, (void *)ref, rhs);
    ASSERT_EQ(0, memcmp(got, ref, 16));
  }
}

/*
 * Compares the plain and parenthesized pand calls on random quadwords,
 * first with a separate destination and then with the destination also
 * serving as the first source.
 */
TEST(pand, fuzz) {
  int round, k;
  uint64_t lhs[2], rhs[2], got[2], ref[2];
  for (round = 0; round < 100; ++round) {
    for (k = 0; k < 2; ++k) {
      lhs[k] = Rando();
    }
    for (k = 0; k < 2; ++k) {
      rhs[k] = Rando();
    }
    pand(got, lhs, rhs);
    (pand)(ref, lhs, rhs);
    ASSERT_EQ(0, memcmp(got, ref, 16));
    pand(got, (void *)got, rhs);
    (pand)(ref, (void *)ref, rhs);
    ASSERT_EQ(0, memcmp(got, ref, 16));
  }
}

/*
 * Compares the plain and parenthesized pandn calls on random quadwords,
 * first with a separate destination and then with the destination also
 * serving as the first source.
 */
TEST(pandn, fuzz) {
  int round, k;
  uint64_t lhs[2], rhs[2], got[2], ref[2];
  for (round = 0; round < 100; ++round) {
    for (k = 0; k < 2; ++k) {
      lhs[k] = Rando();
    }
    for (k = 0; k < 2; ++k) {
      rhs[k] = Rando();
    }
    pandn(got, lhs, rhs);
    (pandn)(ref, lhs, rhs);
    ASSERT_EQ(0, memcmp(got, ref, 16));
    pandn(got, (void *)got, rhs);
    (pandn)(ref, (void *)ref, rhs);
    ASSERT_EQ(0, memcmp(got, ref, 16));
  }
}

/*
 * Compares the plain and parenthesized paddq calls on random quadwords,
 * first with a separate destination and then with the destination also
 * serving as the first source.
 */
TEST(paddq, fuzz) {
  int round, k;
  uint64_t lhs[2], rhs[2], got[2], ref[2];
  for (round = 0; round < 100; ++round) {
    for (k = 0; k < 2; ++k) {
      lhs[k] = Rando();
    }
    for (k = 0; k < 2; ++k) {
      rhs[k] = Rando();
    }
    paddq(got, lhs, rhs);
    (paddq)(ref, lhs, rhs);
    ASSERT_EQ(0, memcmp(got, ref, 16));
    paddq(got, (void *)got, rhs);
    (paddq)(ref, (void *)ref, rhs);
    ASSERT_EQ(0, memcmp(got, ref, 16));
  }
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree,
 * then repeats with the destination aliasing the first source.
 */
TEST(pavgb, fuzz) {
  int i;
  uint8_t x[16], y[16], a[16], b[16];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    pavgb(a, x, y);
    (pavgb)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    pavgb(a, (void *)a, y);
    (pavgb)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree,
 * then repeats with the destination aliasing the first source.
 */
TEST(pavgw, fuzz) {
  int i;
  uint16_t x[8], y[8], a[8], b[8];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    pavgw(a, x, y);
    (pavgw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    pavgw(a, (void *)a, y);
    (pavgw)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree in
 * three configurations: separate destination, destination aliasing the
 * first source, and destination aliasing the second source.
 */
TEST(punpcklwd, fuzz) {
  int i;
  uint16_t x[8], y[8], a[8], b[8];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    punpcklwd(a, x, y);
    (punpcklwd)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    punpcklwd(a, a, y);
    (punpcklwd)(b, b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    punpcklwd(a, y, a);
    (punpcklwd)(b, y, b);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree,
 * then repeats with the destination aliasing the first source.
 */
TEST(pminub, fuzz) {
  int i;
  uint8_t x[16], y[16], a[16], b[16];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    pminub(a, x, y);
    (pminub)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    pminub(a, (void *)a, y);
    (pminub)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree,
 * then repeats with the destination aliasing the first source.
 */
TEST(pminsw, fuzz) {
  int i;
  int16_t x[8], y[8], a[8], b[8];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    pminsw(a, x, y);
    (pminsw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    pminsw(a, (void *)a, y);
    (pminsw)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree,
 * then repeats with the destination aliasing the first source.
 */
TEST(pmaxub, fuzz) {
  int i;
  uint8_t x[16], y[16], a[16], b[16];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    pmaxub(a, x, y);
    (pmaxub)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    pmaxub(a, (void *)a, y);
    (pmaxub)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree,
 * then repeats with the destination aliasing the first source.
 */
TEST(pmaxsw, fuzz) {
  int i;
  int16_t x[8], y[8], a[8], b[8];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    pmaxsw(a, x, y);
    (pmaxsw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    pmaxsw(a, (void *)a, y);
    (pmaxsw)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/* First lane: 7 + 11 yields 18. */
TEST(paddw, test) {
  int16_t lhs[8] = {7};
  int16_t rhs[8] = {11};
  int16_t sum[8];
  paddw(sum, lhs, rhs);
  ASSERT_EQ(18, sum[0]);
}

/* SHRT_MAX + 1 and SHRT_MIN - 1 wrap to the opposite extreme. */
TEST(paddw, testOverflow_wrapsAround) {
  int16_t acc[8] = {SHRT_MAX, SHRT_MIN};
  int16_t delta[8] = {1, -1};
  paddw(acc, acc, delta);
  ASSERT_EQ(SHRT_MIN, acc[0]);
  ASSERT_EQ(SHRT_MAX, acc[1]);
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree,
 * then repeats with the destination aliasing the first source.
 */
TEST(paddw, fuzz) {
  int i;
  int16_t x[8], y[8], a[8], b[8];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    paddw(a, x, y);
    (paddw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    paddw(a, (void *)a, y);
    (paddw)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/* First lane: 7 + 11 yields 18. */
TEST(paddsw, test) {
  int16_t lhs[8] = {7};
  int16_t rhs[8] = {11};
  int16_t sum[8];
  paddsw(sum, lhs, rhs);
  ASSERT_EQ(18, sum[0]);
}

/* SHRT_MAX + 1 and SHRT_MIN - 1 stay pinned at their extremes. */
TEST(paddsw, testOverflow_saturates) {
  int16_t acc[8] = {SHRT_MAX, SHRT_MIN};
  int16_t delta[8] = {1, -1};
  paddsw(acc, acc, delta);
  ASSERT_EQ(SHRT_MAX, acc[0]);
  ASSERT_EQ(SHRT_MIN, acc[1]);
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree,
 * then repeats with the destination aliasing the first source.
 */
TEST(paddusw, fuzz) {
  int i;
  uint16_t x[8], y[8], a[8], b[8];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    paddusw(a, x, y);
    (paddusw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    paddusw(a, (void *)a, y);
    (paddusw)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

/*
 * Checks on random inputs that the plain and parenthesized calls agree,
 * then repeats with the destination aliasing the first source.
 */
TEST(psubb, fuzz) {
  int i;
  uint8_t x[16], y[16], a[16], b[16];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    psubb(a, x, y);
    (psubb)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    psubb(a, (void *)a, y);
    (psubb)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Fuzz test: the plain call and the parenthesized call (which skips any
// function-like macro named psubw) must write the same 16 bytes, both
// with a separate output and with the output aliasing the first input.
TEST(psubw, fuzz) {
  int i;
  int16_t x[8], y[8], a[8], b[8];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    psubw(a, x, y);
    (psubw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    // Repeat with the destination doubling as the first operand.
    psubw(a, (void *)a, y);
    (psubw)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Fuzz test: the plain call and the parenthesized call (which skips any
// function-like macro named psubusw) must write the same 16 bytes, both
// with a separate output and with the output aliasing the first input.
TEST(psubusw, fuzz) {
  int i;
  uint16_t x[8], y[8], a[8], b[8];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    psubusw(a, x, y);
    (psubusw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    // Repeat with the destination doubling as the first operand.
    psubusw(a, (void *)a, y);
    (psubusw)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Fuzz test: the plain call and the parenthesized call (which skips any
// function-like macro named paddusb) must write the same 16 bytes, both
// with a separate output and with the output aliasing the first input.
TEST(paddusb, fuzz) {
  int i;
  uint8_t x[16], y[16], a[16], b[16];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    paddusb(a, x, y);
    (paddusb)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    // Repeat with the destination doubling as the first operand.
    paddusb(a, (void *)a, y);
    (paddusb)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Fuzz test: the plain call and the parenthesized call (which skips any
// function-like macro named psubusb) must write the same 16 bytes, both
// with a separate output and with the output aliasing the first input.
TEST(psubusb, fuzz) {
  int i;
  uint8_t x[16], y[16], a[16], b[16];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    psubusb(a, x, y);
    (psubusb)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    // Repeat with the destination doubling as the first operand.
    psubusb(a, (void *)a, y);
    (psubusb)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Fuzz test: for random input, the plain call and the parenthesized call
// (which skips any function-like macro named pabsb) must write the same
// 16 bytes. On mismatch the iteration, input, and both outputs are shown.
TEST(pabsb, fuzz) {
  int i;
  int8_t x[16];
  uint8_t a[16], b[16];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    pabsb(a, x);
    (pabsb)(b, x);
    ASSERT_EQ(0, memcmp(a, b, 16), "%d\n\t%#.16hhs\n\t%#.16hhs\n\t%#.16hhs", i,
              x, a, b);
  }
}

// Fuzz test: for random input, the plain call and the parenthesized call
// (which skips any function-like macro named pabsw) must write the same
// 16 bytes.
TEST(pabsw, fuzz) {
  int i;
  int16_t x[8];
  uint16_t a[8], b[8];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    pabsw(a, x);
    (pabsw)(b, x);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Checks that pabsd applied in place to INT_MIN leaves lane 0 equal to
// INT_MIN, for both the plain call and the parenthesized call (which
// skips any function-like macro named pabsd); then fuzzes the two call
// forms against each other.
TEST(pabsd, fuzz) {
  int i;
  // Zero-filled so lanes 1..3 are never handed to pabsd uninitialized.
  int32_t x[4] = {0};
  uint32_t a[4], b[4];
  x[0] = INT_MIN;
  pabsd((uint32_t *)x, x);
  EXPECT_EQ(INT_MIN, x[0]);
  x[0] = INT_MIN;
  (pabsd)((uint32_t *)x, x);
  EXPECT_EQ(INT_MIN, x[0]);
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    pabsd(a, x);
    (pabsd)(b, x);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Fuzz test: the plain call and the parenthesized call (which skips any
// function-like macro named psignb) must write the same 16 bytes, both
// with a separate output and with the output aliasing the first input.
TEST(psignb, fuzz) {
  int i;
  int8_t x[16], y[16], a[16], b[16];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    psignb(a, x, y);
    (psignb)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    // Repeat with the destination doubling as the first operand.
    psignb(a, (void *)a, y);
    (psignb)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Fuzz test: the plain call and the parenthesized call (which skips any
// function-like macro named psignw) must write the same 16 bytes, both
// with a separate output and with the output aliasing the first input.
TEST(psignw, fuzz) {
  int i;
  int16_t x[8], y[8], a[8], b[8];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    psignw(a, x, y);
    (psignw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    // Repeat with the destination doubling as the first operand.
    psignw(a, (void *)a, y);
    (psignw)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Fuzz test: the plain call and the parenthesized call (which skips any
// function-like macro named psignd) must write the same 16 bytes, both
// with a separate output and with the output aliasing the first input.
TEST(psignd, fuzz) {
  int i;
  int32_t x[4], y[4], a[4], b[4];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    psignd(a, x, y);
    (psignd)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    // Repeat with the destination doubling as the first operand.
    psignd(a, (void *)a, y);
    (psignd)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Applies psignd in place to four INT_MIN lanes using sign operands of
// 0, 1, -1 and INT_MIN. Lane 0 is expected to become zero; the other
// three are expected to still read INT_MIN.
TEST(psignd, testBane) {
  int32_t signs[4] = {0, 1, -1, INT_MIN};
  int32_t lanes[4] = {INT_MIN, INT_MIN, INT_MIN, INT_MIN};
  psignd(lanes, lanes, signs);
  EXPECT_EQ(0, lanes[0]);        // sign operand 0
  EXPECT_EQ(INT_MIN, lanes[1]);  // sign operand 1
  EXPECT_EQ(INT_MIN, lanes[2]);  // sign operand -1
  EXPECT_EQ(INT_MIN, lanes[3]);  // sign operand INT_MIN
}

// Fuzz test: the plain call and the parenthesized call (which skips any
// function-like macro named paddb) must write the same 16 bytes, both
// with a separate output and with the output aliasing the first input.
TEST(paddb, fuzz) {
  int i;
  int8_t x[16], y[16], a[16], b[16];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    paddb(a, x, y);
    (paddb)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    // Repeat with the destination doubling as the first operand.
    paddb(a, (void *)a, y);
    (paddb)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Fuzz test: the plain call and the parenthesized call (which skips any
// function-like macro named paddsb) must write the same 16 bytes, both
// with a separate output and with the output aliasing the first input.
TEST(paddsb, fuzz) {
  int i;
  int8_t x[16], y[16], a[16], b[16];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    paddsb(a, x, y);
    (paddsb)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    // Repeat with the destination doubling as the first operand.
    paddsb(a, (void *)a, y);
    (paddsb)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Fuzz test: the plain call and the parenthesized call (which skips any
// function-like macro named paddsw) must write the same 16 bytes, both
// with a separate output and with the output aliasing the first input.
TEST(paddsw, fuzz) {
  int i;
  int16_t x[8], y[8], a[8], b[8];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    paddsw(a, x, y);
    (paddsw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    // Repeat with the destination doubling as the first operand.
    paddsw(a, (void *)a, y);
    (paddsw)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Fuzz test: the plain call and the parenthesized call (which skips any
// function-like macro named psubsb) must write the same 16 bytes, both
// with a separate output and with the output aliasing the first input.
TEST(psubsb, fuzz) {
  int i;
  int8_t x[16], y[16], a[16], b[16];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    psubsb(a, x, y);
    (psubsb)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    // Repeat with the destination doubling as the first operand.
    psubsb(a, (void *)a, y);
    (psubsb)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Fuzz test: the plain call and the parenthesized call (which skips any
// function-like macro named psubsw) must write the same 16 bytes, both
// with a separate output and with the output aliasing the first input.
TEST(psubsw, fuzz) {
  int i;
  int16_t x[8], y[8], a[8], b[8];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    psubsw(a, x, y);
    (psubsw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    // Repeat with the destination doubling as the first operand.
    psubsw(a, (void *)a, y);
    (psubsw)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// First exercises lane 0 with 0x7fffffff + 0x7fffffff, which exceeds
// INT32_MAX, and checks the wrapped 32-bit sum; then fuzzes the plain
// call against the parenthesized call (which skips any function-like
// macro named paddd), with and without output/input aliasing.
TEST(paddd, fuzz) {
  int i;
  uint32_t x[4], y[4], a[4], b[4];
  RngSet(x, sizeof(x));
  RngSet(y, sizeof(y));
  x[0] = 0x7fffffff;
  y[0] = 0x7fffffff;
  (paddd)(b, x, y);
  // Previously this result went unchecked; unsigned lanes wrap modulo 2^32.
  ASSERT_EQ(0xfffffffeu, b[0]);
  paddd(a, x, y);
  ASSERT_EQ(0, memcmp(a, b, 16));
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    paddd(a, x, y);
    (paddd)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    // Repeat with the destination doubling as the first operand.
    paddd(a, (void *)a, y);
    (paddd)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Fuzz test: the plain call and the parenthesized call (which skips any
// function-like macro named pshufb) must write the same 16 bytes, both
// with a separate output and with the output aliasing the first input.
TEST(pshufb, fuzz) {
  int i;
  uint8_t x[16], y[16], a[16], b[16];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    pshufb(a, x, y);
    (pshufb)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    // Repeat with the destination doubling as the first operand.
    pshufb(a, (void *)a, y);
    (pshufb)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Fuzzes pshufd over a fixed list of immediates. For each immediate the
// plain call and the parenthesized call (which skips any function-like
// macro named pshufd) must agree, first with a separate output and then
// with the output aliasing the input.
TEST(pshufd, fuzz) {
#define T(IMM)                           \
  pshufd(a, x, IMM);                     \
  (pshufd)(b, x, IMM);                   \
  ASSERT_EQ(0, memcmp(a, b, sizeof(a))); \
  pshufd(a, (void *)a, IMM);             \
  (pshufd)(b, (void *)b, IMM);           \
  ASSERT_EQ(0, memcmp(a, b, sizeof(a)))
  int round, lane;
  int32_t x[4], a[4], b[4];
  for (round = 0; round < 100; ++round) {
    for (lane = 0; lane < 4; ++lane) {
      x[lane] = Rando();
    }
    T(0b00000011);
    T(0b00000110);
    T(0b00001100);
    T(0b00011000);
    T(0b00110000);
    T(0b01100000);
    T(0b11000000);
    T(0b10000000);
  }
#undef T
}

TEST(pshuflw, fuzz) {
  int i, j;
  int16_t x[8], a[8], b[8];
  for (i = 0; i < 100; ++i) {
    for (j = 0; j < 8; ++j) x[j] = Rando();
#define T(IMM)                    \
  pshuflw(a, x, IMM);             \
  (pshuflw)(b, x, IMM);           \
  ASSERT_EQ(0, memcmp(a, b, 16)); \
  pshuflw(a, (void *)a, IMM);     \
  (pshuflw)(b, (void *)b, IMM);   \
  ASSERT_EQ(0, memcmp(a, b, 16))
    T(0b00000011);
    T(0b00000110);
    T(0b00001100);
    T(0b00011000);
    T(0b00110000);
    T(0b01100000);
    T(0b11000000);
    T(0b10000000);
#undef T
  }
}

TEST(pshufhw, fuzz) {
  int i, j;
  int16_t x[8], a[8], b[8];
  for (i = 0; i < 100; ++i) {
    for (j = 0; j < 8; ++j) x[j] = Rando();
#define T(IMM)                    \
  pshufhw(a, x, IMM);             \
  (pshufhw)(b, x, IMM);           \
  ASSERT_EQ(0, memcmp(a, b, 16)); \
  pshufhw(a, (void *)a, IMM);     \
  (pshufhw)(b, (void *)b, IMM);   \
  ASSERT_EQ(0, memcmp(a, b, 16))
    T(0b00000011);
    T(0b00000110);
    T(0b00001100);
    T(0b00011000);
    T(0b00110000);
    T(0b01100000);
    T(0b11000000);
    T(0b10000000);
#undef T
  }
}

// Packs the same eight shorts as both operands and checks every output
// byte. Since both operands are S, bytes 8..15 carry the same expected
// values as bytes 0..7.
TEST(packuswb, test) {
  static const int kExpect[8] = {0, 128, 0, 255, 255, 0, 0, 0};
  const short S[8] = {0, 128, -128, 255, SHRT_MAX, SHRT_MIN, 0, 0};
  unsigned char B[16] = {0};
  int k;
  packuswb(B, S, S);
  for (k = 0; k < 16; ++k) {
    ASSERT_EQ(kExpect[k % 8], B[k]);
  }
}

// Packs the same eight shorts as both operands and checks every output
// byte. Since both operands are S, bytes 8..15 carry the same expected
// values as bytes 0..7.
TEST(packsswb, test) {
  static const int kExpect[8] = {0, 127, -128, 127, 127, -128, 0, 0};
  const short S[8] = {0, 128, -128, 255, SHRT_MAX, SHRT_MIN, 0, 0};
  signed char B[16] = {0};
  int k;
  packsswb(B, S, S);
  for (k = 0; k < 16; ++k) {
    ASSERT_EQ(kExpect[k % 8], B[k]);
  }
}

// Fuzzes packssdw when the output and the first input occupy the same
// storage (members of one union). The result of the plain call and of
// the parenthesized call (which skips any function-like macro named
// packssdw) are copied out and must match.
TEST(packssdw, testAlias) {
  int i;
  union {
    int16_t out[8];
    int32_t in1[4];
  } u;
  int16_t a[8], b[8];
  int32_t x[4], y[4];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    memcpy(u.in1, x, sizeof(x));
    packssdw(u.out, u.in1, y);
    memcpy(a, u.out, sizeof(u.out));
    // Reload the input, since the previous call overwrote the shared storage.
    memcpy(u.in1, x, sizeof(x));
    (packssdw)(u.out, u.in1, y);
    memcpy(b, u.out, sizeof(u.out));
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Fuzz test: for random inputs, the plain call and the parenthesized call
// (which skips any function-like macro named packusdw) must write the
// same 16 bytes.
TEST(packusdw, test) {
  int i;
  int32_t x[4], y[4];
  uint16_t a[8], b[8];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    packusdw(a, x, y);
    (packusdw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Fuzz test: the plain call and the parenthesized call (which skips any
// function-like macro named packuswb) must write the same 16 bytes, both
// with two different inputs and with the same array passed as both inputs.
TEST(packuswb, fuzz) {
  int i;
  int16_t x[8], y[8];
  uint8_t a[16], b[16];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    packuswb(a, x, y);
    (packuswb)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    // Repeat with x supplied for both source operands.
    packuswb(a, x, x);
    (packuswb)(b, x, x);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Fuzz test: for random inputs, the plain call and the parenthesized call
// (which skips any function-like macro named packssdw) must write the
// same 16 bytes.
TEST(packssdw, test) {
  int i;
  int32_t x[4], y[4];
  int16_t a[8], b[8];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    packssdw(a, x, y);
    (packssdw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Fuzzes psllwv with random lanes in x and Rando() % 64 in each element
// of y. The plain call and the parenthesized call (which skips any
// function-like macro named psllwv) must agree, with a separate output
// and with the output aliasing the first input.
TEST(psllwv, fuzz) {
  int round, k;
  uint16_t x[8], a[8], b[8];
  uint64_t y[2];
  for (round = 0; round < 100; ++round) {
    for (k = 0; k < 8; ++k) {
      x[k] = Rando();
    }
    for (k = 0; k < 2; ++k) {
      y[k] = Rando() % 64;
    }
    psllwv(a, x, y);
    (psllwv)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, sizeof(a)));
    // Same comparison with the destination doubling as the source.
    psllwv(a, (void *)a, y);
    (psllwv)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, sizeof(a)));
  }
}

// Fuzzes pslldv with random lanes in x and Rando() % 64 in each element
// of y. The plain call and the parenthesized call (which skips any
// function-like macro named pslldv) must agree, with a separate output
// and with the output aliasing the first input.
TEST(pslldv, fuzz) {
  int round, k;
  uint32_t x[4], a[4], b[4];
  uint64_t y[2];
  for (round = 0; round < 100; ++round) {
    for (k = 0; k < 4; ++k) {
      x[k] = Rando();
    }
    for (k = 0; k < 2; ++k) {
      y[k] = Rando() % 64;
    }
    pslldv(a, x, y);
    (pslldv)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, sizeof(a)));
    // Same comparison with the destination doubling as the source.
    pslldv(a, (void *)a, y);
    (pslldv)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, sizeof(a)));
  }
}

// Fuzzes psllqv with random lanes in x and Rando() % 64 in each element
// of y. The plain call and the parenthesized call (which skips any
// function-like macro named psllqv) must agree, with a separate output
// and with the output aliasing the first input.
TEST(psllqv, fuzz) {
  int round, k;
  uint64_t x[2], a[2], b[2];
  uint64_t y[2];
  for (round = 0; round < 100; ++round) {
    for (k = 0; k < 2; ++k) {
      x[k] = Rando();
    }
    for (k = 0; k < 2; ++k) {
      y[k] = Rando() % 64;
    }
    psllqv(a, x, y);
    (psllqv)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, sizeof(a)));
    // Same comparison with the destination doubling as the source.
    psllqv(a, (void *)a, y);
    (psllqv)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, sizeof(a)));
  }
}

// Fuzzes psrlwv with random lanes in x and Rando() % 64 in each element
// of y. The plain call and the parenthesized call (which skips any
// function-like macro named psrlwv) must agree, with a separate output
// and with the output aliasing the first input.
TEST(psrlwv, fuzz) {
  int round, k;
  uint16_t x[8], a[8], b[8];
  uint64_t y[2];
  for (round = 0; round < 100; ++round) {
    for (k = 0; k < 8; ++k) {
      x[k] = Rando();
    }
    for (k = 0; k < 2; ++k) {
      y[k] = Rando() % 64;
    }
    psrlwv(a, x, y);
    (psrlwv)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, sizeof(a)));
    // Same comparison with the destination doubling as the source.
    psrlwv(a, (void *)a, y);
    (psrlwv)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, sizeof(a)));
  }
}

// Fuzzes psrldv with random lanes in x and Rando() % 64 in each element
// of y. The plain call and the parenthesized call (which skips any
// function-like macro named psrldv) must agree, with a separate output
// and with the output aliasing the first input.
TEST(psrldv, fuzz) {
  int round, k;
  uint32_t x[4], a[4], b[4];
  uint64_t y[2];
  for (round = 0; round < 100; ++round) {
    for (k = 0; k < 4; ++k) {
      x[k] = Rando();
    }
    for (k = 0; k < 2; ++k) {
      y[k] = Rando() % 64;
    }
    psrldv(a, x, y);
    (psrldv)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, sizeof(a)));
    // Same comparison with the destination doubling as the source.
    psrldv(a, (void *)a, y);
    (psrldv)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, sizeof(a)));
  }
}

// Fuzzes psrlqv with random lanes in x and Rando() % 64 in each element
// of y. The plain call and the parenthesized call (which skips any
// function-like macro named psrlqv) must agree, with a separate output
// and with the output aliasing the first input.
TEST(psrlqv, fuzz) {
  int round, k;
  uint64_t x[2], a[2], b[2];
  uint64_t y[2];
  for (round = 0; round < 100; ++round) {
    for (k = 0; k < 2; ++k) {
      x[k] = Rando();
    }
    for (k = 0; k < 2; ++k) {
      y[k] = Rando() % 64;
    }
    psrlqv(a, x, y);
    (psrlqv)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, sizeof(a)));
    // Same comparison with the destination doubling as the source.
    psrlqv(a, (void *)a, y);
    (psrlqv)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, sizeof(a)));
  }
}

// Fuzzes psrawv with random lanes in x and Rando() % 64 in each element
// of y. The plain call and the parenthesized call (which skips any
// function-like macro named psrawv) must agree, with a separate output
// and with the output aliasing the first input.
TEST(psrawv, fuzz) {
  int round, k;
  int16_t x[8], a[8], b[8];
  uint64_t y[2];
  for (round = 0; round < 100; ++round) {
    for (k = 0; k < 8; ++k) {
      x[k] = Rando();
    }
    for (k = 0; k < 2; ++k) {
      y[k] = Rando() % 64;
    }
    psrawv(a, x, y);
    (psrawv)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, sizeof(a)));
    // Same comparison with the destination doubling as the source.
    psrawv(a, (void *)a, y);
    (psrawv)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, sizeof(a)));
  }
}

// Fuzzes psradv with random lanes in x and Rando() % 64 in each element
// of y. The plain call and the parenthesized call (which skips any
// function-like macro named psradv) must agree, with a separate output
// and with the output aliasing the first input.
TEST(psradv, fuzz) {
  int round, k;
  int32_t x[4], a[4], b[4];
  uint64_t y[2];
  for (round = 0; round < 100; ++round) {
    for (k = 0; k < 4; ++k) {
      x[k] = Rando();
    }
    for (k = 0; k < 2; ++k) {
      y[k] = Rando() % 64;
    }
    psradv(a, x, y);
    (psradv)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, sizeof(a)));
    // Same comparison with the destination doubling as the source.
    psradv(a, (void *)a, y);
    (psradv)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, sizeof(a)));
  }
}

// TODO(jart): Fix me on low-power CPUs (test is disabled until then).
// TEST(psrldq, fuzz) {
//   int i, n;
//   uint8_t x[16], a[16], b[16];
//   for (i = 0; i < 100; ++i) {
//     memset(a, -1, sizeof(a));
//     memset(b, -1, sizeof(b));
//     RngSet(x, sizeof(x));
//     n = Rando() % 20;
//     psrldq(a, x, n);
//     (psrldq)(b, x, n);
//     ASSERT_EQ(0, memcmp(a, b, 16), "%d\n\t%#.16hhs\n\t%#.16hhs\n\t%#.16hhs",
//               n, x, a, b);
//     n = Rando() % 20;
//     psrldq(a, a, n);
//     (psrldq)(b, b, n);
//     ASSERT_EQ(0, memcmp(a, b, 16), "%d\n\t%#.16hhs\n\t%#.16hhs\n\t%#.16hhs",
//               n, x, a, b);
//   }
// }

// Fuzzes pslldq with a count of Rando() % 20. Both outputs start filled
// with 0xff bytes. The plain call and the parenthesized call (which
// skips any function-like macro named pslldq) must agree, first from a
// separate input and then operating in place with a fresh count. On
// mismatch the count, input, and both outputs are shown.
TEST(pslldq, fuzz) {
  int round, n;
  uint8_t x[16], a[16], b[16];
  for (round = 0; round < 100; ++round) {
    memset(a, -1, sizeof(a));
    memset(b, -1, sizeof(b));
    RngSet(x, sizeof(x));

    n = Rando() % 20;
    pslldq(a, x, n);
    (pslldq)(b, x, n);
    ASSERT_EQ(0, memcmp(a, b, sizeof(a)),
              "%d\n\t%#.16hhs\n\t%#.16hhs\n\t%#.16hhs", n, x, a, b);

    // In-place variant: each output is also its own input.
    n = Rando() % 20;
    pslldq(a, a, n);
    (pslldq)(b, b, n);
    ASSERT_EQ(0, memcmp(a, b, sizeof(a)),
              "%d\n\t%#.16hhs\n\t%#.16hhs\n\t%#.16hhs", n, x, a, b);
  }
}

// Fuzz test: for random inputs, the plain call and the parenthesized call
// (which skips any function-like macro named psadbw) must write the same
// 16 bytes.
TEST(psadbw, test) {
  int i;
  uint64_t a[2], b[2];
  uint8_t x[16], y[16];
  for (i = 0; i < 100; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    psadbw(a, x, y);
    (psadbw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Fuzz test (1000 rounds): the plain call and the parenthesized call
// (which skips any function-like macro named pmulhrsw) must write the
// same 16 bytes, both with a separate output and with the output
// aliasing the first input.
TEST(pmulhrsw, fuzz) {
  int i;
  int16_t x[8], y[8], a[8], b[8];
  for (i = 0; i < 1000; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    pmulhrsw(a, x, y);
    (pmulhrsw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    // Repeat with the destination doubling as the first operand.
    pmulhrsw(a, (void *)a, y);
    (pmulhrsw)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Fuzzes mpsadbw: for each random input pair, every value 0..7 of the
// fourth argument is tried, and the plain call must match the
// parenthesized call (which skips any function-like macro named
// mpsadbw). On mismatch the round and the fourth argument are shown.
TEST(mpsadbw, fuzz) {
  int round, k;
  uint8_t x[16], y[16];
  uint16_t a[8], b[8];
  for (round = 0; round < 100; ++round) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    k = 0;
    while (k < 8) {
      mpsadbw(a, x, y, k);
      (mpsadbw)(b, x, y, k);
      ASSERT_EQ(0, memcmp(a, b, sizeof(a)), "%d %d", round, k);
      ++k;
    }
  }
}

// Fuzz test (1000 rounds): the plain call and the parenthesized call
// (which skips any function-like macro named pmaddubsw) must write the
// same 16 bytes, both with a separate output and with the output buffer
// reinterpreted as the first input.
TEST(pmaddubsw, fuzz) {
  int i;
  int8_t y[16];
  uint8_t x[16];
  int16_t a[8], b[8];
  for (i = 0; i < 1000; ++i) {
    RngSet(x, sizeof(x));
    RngSet(y, sizeof(y));
    pmaddubsw(a, x, y);
    (pmaddubsw)(b, x, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
    // Repeat with the destination doubling as the first operand.
    pmaddubsw(a, (void *)a, y);
    (pmaddubsw)(b, (void *)b, y);
    ASSERT_EQ(0, memcmp(a, b, 16));
  }
}

// Fuzzes pshufw (8-byte operands) over a fixed list of immediates. For
// each one the plain call must match the parenthesized call (which skips
// any function-like macro named pshufw). The first immediate is also
// checked with the output aliasing the input.
TEST(pshufw, fuzz) {
  int i, j;
  int16_t x[4], a[4], b[4];
  for (i = 0; i < 100; ++i) {
    for (j = 0; j < 4; ++j) x[j] = Rando();
    pshufw(a, x, 0b10111111);
    (pshufw)(b, x, 0b10111111);
    ASSERT_EQ(0, memcmp(a, b, 8));
    // In-place variant for the same immediate.
    pshufw(a, (void *)a, 0b10111111);
    (pshufw)(b, (void *)b, 0b10111111);
    ASSERT_EQ(0, memcmp(a, b, 8));
    pshufw(a, x, 0b00001000);
    (pshufw)(b, x, 0b00001000);
    ASSERT_EQ(0, memcmp(a, b, 8));
    pshufw(a, x, 0b00010001);
    (pshufw)(b, x, 0b00010001);
    ASSERT_EQ(0, memcmp(a, b, 8));
    pshufw(a, x, 0b01110100);
    (pshufw)(b, x, 0b01110100);
    ASSERT_EQ(0, memcmp(a, b, 8));
    pshufw(a, x, 0b01101101);
    (pshufw)(b, x, 0b01101101);
    ASSERT_EQ(0, memcmp(a, b, 8));
    pshufw(a, x, 0b10011011);
    (pshufw)(b, x, 0b10011011);
    ASSERT_EQ(0, memcmp(a, b, 8));
    pshufw(a, x, 0b10111000);
    (pshufw)(b, x, 0b10111000);
    ASSERT_EQ(0, memcmp(a, b, 8));
    pshufw(a, x, 0b11000111);
    (pshufw)(b, x, 0b11000111);
    ASSERT_EQ(0, memcmp(a, b, 8));
  }
}

// Exhaustively pairs rotations of a small table of 16-bit bit patterns:
// for every (i, j), x and y are filled with eight consecutive table
// entries starting at i and j (wrapping around), and the plain call must
// match the parenthesized call (which skips any function-like macro
// named pcmpeqw).
TEST(pcmpeqw, test2) {
  // Patterns above INT16_MAX are cast explicitly so the narrowing to
  // int16_t is intentional rather than an implicit out-of-range conversion.
  static const int16_t kNumbers16[] = {
      0,
      1,
      2,
      123,
      (int16_t)0xffff,
      (int16_t)0xfffe,
      (int16_t)0x8000,
      (int16_t)0x8001,
      (int16_t)0x8080,
  };
  // size_t indices avoid signed/unsigned comparison against ARRAYLEN().
  size_t i, j, k;
  int16_t a[8], b[8], x[8], y[8];
  for (i = 0; i < ARRAYLEN(kNumbers16); ++i) {
    for (j = 0; j < ARRAYLEN(kNumbers16); ++j) {
      for (k = 0; k < 8; ++k) {
        x[k] = kNumbers16[(i + k) % ARRAYLEN(kNumbers16)];
        y[k] = kNumbers16[(j + k) % ARRAYLEN(kNumbers16)];
      }
      pcmpeqw(a, x, y);
      (pcmpeqw)(b, x, y);
      EXPECT_EQ(0, memcmp(a, b, 16));
    }
  }
}

// Fuzz test: for 1000 pairs of rand64() values, the plain call and the
// parenthesized call (which skips any function-like macro named pdep)
// must return the same value.
TEST(pdep, fuzz) {
  int remaining;
  for (remaining = 1000; remaining > 0; --remaining) {
    const uint64_t x = rand64();
    const uint64_t y = rand64();
    ASSERT_EQ(pdep(x, y), (pdep)(x, y));
  }
}

// Fuzz test: for 1000 pairs of rand64() values, the plain call and the
// parenthesized call (which skips any function-like macro named pext)
// must return the same value.
TEST(pext, fuzz) {
  int remaining;
  for (remaining = 1000; remaining > 0; --remaining) {
    const uint64_t x = rand64();
    const uint64_t y = rand64();
    ASSERT_EQ(pext(x, y), (pext)(x, y));
  }
}