diff --git a/Makefile b/Makefile index 7f0eca1ae..7ffd17bca 100644 --- a/Makefile +++ b/Makefile @@ -143,6 +143,7 @@ include libc/time/time.mk # │ include libc/stdio/stdio.mk # │ include third_party/libcxx/libcxx.mk # │ include net/net.mk # │ +include third_party/vqsort/vqsort.mk # │ include libc/log/log.mk # │ include third_party/bzip2/bzip2.mk # │ include dsp/core/core.mk # │ diff --git a/examples/examples.mk b/examples/examples.mk index ab28ad6b7..496403a90 100644 --- a/examples/examples.mk +++ b/examples/examples.mk @@ -89,6 +89,7 @@ EXAMPLES_DIRECTDEPS = \ THIRD_PARTY_SED \ THIRD_PARTY_STB \ THIRD_PARTY_TR \ + THIRD_PARTY_VQSORT \ THIRD_PARTY_XED \ THIRD_PARTY_ZLIB \ TOOL_BUILD_LIB \ diff --git a/examples/vqsort.c b/examples/vqsort.c new file mode 100644 index 000000000..06371b190 --- /dev/null +++ b/examples/vqsort.c @@ -0,0 +1,25 @@ +#if 0 +/*─────────────────────────────────────────────────────────────────╗ +│ To the extent possible under law, Justine Tunney has waived │ +│ all copyright and related or neighboring rights to this file, │ +│ as it is written in the following disclaimers: │ +│ • http://unlicense.org/ │ +│ • http://creativecommons.org/publicdomain/zero/1.0/ │ +╚─────────────────────────────────────────────────────────────────*/ +#endif +#include "third_party/vqsort/vqsort.h" +#include "libc/macros.internal.h" +#include "libc/stdio/stdio.h" +#include "third_party/vqsort/vqsort.h" + +// how to sort one gigabyte of 64-bit integers per second + +int main(int argc, char *argv[]) { + int64_t A[] = {9, 3, -3, 5, 23, 7}; + vqsort_int64(A, ARRAYLEN(A)); + for (int i = 0; i < ARRAYLEN(A); ++i) { + if (i) printf(" "); + printf("%ld", A[i]); + } + printf("\n"); +} diff --git a/libc/mem/alg.h b/libc/mem/alg.h index 352ffb80c..5208fc35f 100644 --- a/libc/mem/alg.h +++ b/libc/mem/alg.h @@ -34,6 +34,9 @@ int _tarjan(int, const int (*)[2], int, int[], int[], int *) char *_replacestr(const char *, const char *, const char *) paramsnonnull() __algalloc; +bool radix_sort_int32(int32_t *, size_t); +bool radix_sort_int64(int64_t *, size_t); + COSMOPOLITAN_C_END_ #endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ #endif /* COSMOPOLITAN_LIBC_ALG_ALG_H_ */ diff --git a/libc/mem/radix_sort_int32.c b/libc/mem/radix_sort_int32.c new file mode 100644 index 000000000..21bcf3787 --- /dev/null +++ b/libc/mem/radix_sort_int32.c @@ -0,0 +1,101 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2023 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. 
│ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/mem/alg.h" +#include "libc/mem/mem.h" +#include "libc/runtime/runtime.h" +#include "libc/str/str.h" + +// Credit: Andrew Schein. 2009. Open-source C++ implementation of Radix +// Sort for double-precision floating points. (2009). + +#define HIST_SIZE (size_t)2048 +#define get_byte_0(v) ((v)&0x7FF) +#define get_byte_1(v) (((v) >> 11) & 0x7FF) +#define get_byte_2_flip_sign(v) (((unsigned)(v) >> 22) ^ 0x200) + +bool radix_sort_int32(int32_t *A, size_t n) { + int32_t *T, *reader, *writer; + size_t i, pos, sum0, sum1, sum2, tsum, *b0, *b1, *b2; + + if (n < HIST_SIZE) { + _intsort(A, n); + return true; + } + + if (!(T = (int32_t *)malloc(n * sizeof(int32_t)))) { + return false; + } + + if (!(b0 = (size_t *)calloc(HIST_SIZE * 3, sizeof(size_t)))) { + free(T); + return false; + } + + b1 = b0 + HIST_SIZE; + b2 = b1 + HIST_SIZE; + + for (i = 0; i < n; i++) { + b0[get_byte_0(A[i])]++; + b1[get_byte_1(A[i])]++; + b2[get_byte_2_flip_sign(A[i])]++; + } + + sum0 = sum1 = sum2 = tsum = 0; + + for (i = 0; i < HIST_SIZE; i++) { + tsum = b0[i] + sum0; + b0[i] = sum0 - 1; + sum0 = tsum; + + tsum = b1[i] + sum1; + b1[i] = sum1 - 1; + sum1 = tsum; + + tsum = b2[i] + sum2; + b2[i] = sum2 - 1; + sum2 = tsum; + } + + writer = T; + reader = A; + for (i = 0; i < n; i++) { + pos = get_byte_0(reader[i]); + writer[++b0[pos]] = reader[i]; + } + + writer = A; + reader = T; + for (i = 0; i < n; i++) { + pos = get_byte_1(reader[i]); + writer[++b1[pos]] = reader[i]; + } + + writer = T; + reader = A; + for (i = 0; i < n; i++) { + pos = get_byte_2_flip_sign(reader[i]); + writer[++b2[pos]] = reader[i]; + } + + memcpy(A, T, n * sizeof(int)); + + free(b0); + free(T); + return true; +} diff --git a/libc/mem/radix_sort_int64.c b/libc/mem/radix_sort_int64.c new file mode 100644 index 000000000..4f1aaf1c0 --- /dev/null +++ b/libc/mem/radix_sort_int64.c @@ -0,0 +1,144 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2023 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/mem/alg.h" +#include "libc/mem/mem.h" +#include "libc/runtime/runtime.h" +#include "libc/str/str.h" + +// Credit: Andrew Schein. 2009. Open-source C++ implementation of Radix +// Sort for double-precision floating points. (2009). 
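+//
+// The keys are consumed as six 11-bit digits (HIST_SIZE is 2048, i.e.
+// 1<<11), least significant digit first. A single pass histograms all
+// six digits; the prefix-sum loop then turns each histogram into start
+// offsets minus one so the scatter loops can store with the
+// pre-increment idiom writer[++b0[pos]]. The topmost digit comes from
+// an arithmetic shift, so its 0x400 bit reflects the key's sign;
+// XORing that bit in get_byte_5_flip_sign() makes negative values
+// order before non-negative ones. Six ping-pong passes between A and
+// the scratch buffer T leave the sorted result back in A.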
+ +#define HIST_SIZE (size_t)2048 +#define get_byte_0(v) ((v)&0x7FF) +#define get_byte_1(v) (((v) >> 11) & 0x7FF) +#define get_byte_2(v) (((v) >> 22) & 0x7FF) +#define get_byte_3(v) (((v) >> 33) & 0x7FF) +#define get_byte_4(v) (((v) >> 44) & 0x7FF) +#define get_byte_5(v) (((v) >> 55) & 0x7FF) +#define get_byte_2_flip_sign(v) (((unsigned)(v) >> 22) ^ 0x200) +#define get_byte_5_flip_sign(v) ((((v) >> 55) & 0x7FF) ^ 0x400) + +bool radix_sort_int64(int64_t *A, size_t n) { + int64_t *T, *reader, *writer; + size_t *b0, *b1, *b2, *b3, *b4, *b5; + size_t i, pos, sum0, sum1, sum2, sum3, sum4, sum5, tsum; + + if (n < HIST_SIZE) { + _longsort(A, n); + return true; + } + + if (!(T = (int64_t *)malloc(n * sizeof(int64_t)))) { + return false; + } + + if (!(b0 = (size_t *)calloc(HIST_SIZE * 6, sizeof(size_t)))) { + free(T); + return false; + } + + b1 = b0 + HIST_SIZE; + b2 = b1 + HIST_SIZE; + b3 = b2 + HIST_SIZE; + b4 = b3 + HIST_SIZE; + b5 = b4 + HIST_SIZE; + + for (i = 0; i < n; i++) { + b0[get_byte_0(A[i])]++; + b1[get_byte_1(A[i])]++; + b2[get_byte_2(A[i])]++; + b3[get_byte_3(A[i])]++; + b4[get_byte_4(A[i])]++; + b5[get_byte_5_flip_sign(A[i])]++; + } + + sum0 = sum1 = sum2 = sum3 = sum4 = sum5 = tsum = 0; + + for (i = 0; i < HIST_SIZE; i++) { + tsum = b0[i] + sum0; + b0[i] = sum0 - 1; + sum0 = tsum; + + tsum = b1[i] + sum1; + b1[i] = sum1 - 1; + sum1 = tsum; + + tsum = b2[i] + sum2; + b2[i] = sum2 - 1; + sum2 = tsum; + + tsum = b3[i] + sum3; + b3[i] = sum3 - 1; + sum3 = tsum; + + tsum = b4[i] + sum4; + b4[i] = sum4 - 1; + sum4 = tsum; + + tsum = b5[i] + sum5; + b5[i] = sum5 - 1; + sum5 = tsum; + } + + writer = T; + reader = A; + for (i = 0; i < n; i++) { + pos = get_byte_0(reader[i]); + writer[++b0[pos]] = reader[i]; + } + + writer = A; + reader = T; + for (i = 0; i < n; i++) { + pos = get_byte_1(reader[i]); + writer[++b1[pos]] = reader[i]; + } + + writer = T; + reader = A; + for (i = 0; i < n; i++) { + pos = get_byte_2(reader[i]); + writer[++b2[pos]] = reader[i]; + } + + writer = A; + reader = T; + for (i = 0; i < n; i++) { + pos = get_byte_3(reader[i]); + writer[++b3[pos]] = reader[i]; + } + + writer = T; + reader = A; + for (i = 0; i < n; i++) { + pos = get_byte_4(reader[i]); + writer[++b4[pos]] = reader[i]; + } + + writer = A; + reader = T; + for (i = 0; i < n; i++) { + pos = get_byte_5_flip_sign(reader[i]); + writer[++b5[pos]] = reader[i]; + } + + free(b0); + free(T); + return true; +} diff --git a/test/libc/str/longsort_test.c b/test/libc/str/longsort_test.c index 59beb0723..cf63f780d 100644 --- a/test/libc/str/longsort_test.c +++ b/test/libc/str/longsort_test.c @@ -21,9 +21,11 @@ #include "libc/mem/mem.h" #include "libc/runtime/runtime.h" #include "libc/stdio/rand.h" +#include "libc/stdio/stdio.h" #include "libc/str/str.h" #include "libc/testlib/ezbench.h" #include "libc/testlib/testlib.h" +#include "third_party/vqsort/vqsort.h" int CompareLong(const void *a, const void *b) { const long *x = a; @@ -44,13 +46,88 @@ TEST(_longsort, test) { ASSERT_EQ(0, memcmp(b, a, n * sizeof(long))); } +TEST(vqsort_int64_avx2, test) { + if (!X86_HAVE(AVX2)) return; + size_t n = 5000; + long *a = gc(calloc(n, sizeof(long))); + long *b = gc(calloc(n, sizeof(long))); + rngset(a, n * sizeof(long), 0, 0); + memcpy(b, a, n * sizeof(long)); + qsort(a, n, sizeof(long), CompareLong); + vqsort_int64_avx2(b, n); + ASSERT_EQ(0, memcmp(b, a, n * sizeof(long))); +} + +TEST(vqsort_int64_sse4, test) { + if (!X86_HAVE(SSE4_2)) return; + size_t n = 5000; + long *a = gc(calloc(n, sizeof(long))); + long *b = gc(calloc(n, 
sizeof(long))); + rngset(a, n * sizeof(long), 0, 0); + memcpy(b, a, n * sizeof(long)); + qsort(a, n, sizeof(long), CompareLong); + vqsort_int64_sse4(b, n); + ASSERT_EQ(0, memcmp(b, a, n * sizeof(long))); +} + +TEST(vqsort_int64_ssse3, test) { + if (!X86_HAVE(SSSE3)) return; + size_t n = 5000; + long *a = gc(calloc(n, sizeof(long))); + long *b = gc(calloc(n, sizeof(long))); + rngset(a, n * sizeof(long), 0, 0); + memcpy(b, a, n * sizeof(long)); + qsort(a, n, sizeof(long), CompareLong); + vqsort_int64_ssse3(b, n); + ASSERT_EQ(0, memcmp(b, a, n * sizeof(long))); +} + +TEST(vqsort_int64_sse2, test) { + size_t n = 5000; + long *a = gc(calloc(n, sizeof(long))); + long *b = gc(calloc(n, sizeof(long))); + rngset(a, n * sizeof(long), 0, 0); + memcpy(b, a, n * sizeof(long)); + qsort(a, n, sizeof(long), CompareLong); + vqsort_int64_sse2(b, n); + ASSERT_EQ(0, memcmp(b, a, n * sizeof(long))); +} + +TEST(radix_sort_int64, test) { + size_t n = 5000; + long *a = gc(calloc(n, sizeof(long))); + long *b = gc(calloc(n, sizeof(long))); + rngset(a, n * sizeof(long), 0, 0); + memcpy(b, a, n * sizeof(long)); + qsort(a, n, sizeof(long), CompareLong); + radix_sort_int64(b, n); + ASSERT_EQ(0, memcmp(b, a, n * sizeof(long))); +} + BENCH(_longsort, bench) { - size_t n = 1000; + printf("\n"); + size_t n = 5000; long *p1 = gc(malloc(n * sizeof(long))); long *p2 = gc(malloc(n * sizeof(long))); rngset(p1, n * sizeof(long), 0, 0); EZBENCH2("_longsort", memcpy(p2, p1, n * sizeof(long)), _longsort(p2, n)); - EZBENCH2("qsort", memcpy(p2, p1, n * sizeof(long)), + if (X86_HAVE(AVX2)) { + EZBENCH2("vqsort_int64_avx2", memcpy(p2, p1, n * sizeof(long)), + vqsort_int64_avx2(p2, n)); + } + if (X86_HAVE(SSE4_2)) { + EZBENCH2("vqsort_int64_sse4", memcpy(p2, p1, n * sizeof(long)), + vqsort_int64_sse4(p2, n)); + } + if (X86_HAVE(SSSE3)) { + EZBENCH2("vqsort_int64_ssse3", memcpy(p2, p1, n * sizeof(long)), + vqsort_int64_ssse3(p2, n)); + } + EZBENCH2("vqsort_int64_sse2", memcpy(p2, p1, n * sizeof(long)), + vqsort_int64_sse2(p2, n)); + EZBENCH2("radix_sort_int64", memcpy(p2, p1, n * sizeof(long)), + radix_sort_int64(p2, n)); + EZBENCH2("qsort(long)", memcpy(p2, p1, n * sizeof(long)), qsort(p2, n, sizeof(long), CompareLong)); } @@ -73,12 +150,88 @@ TEST(_intsort, test) { ASSERT_EQ(0, memcmp(b, a, n * sizeof(int))); } +TEST(vqsort_int32_avx2, test) { + if (!X86_HAVE(AVX2)) return; + size_t n = 5000; + int *a = gc(calloc(n, sizeof(int))); + int *b = gc(calloc(n, sizeof(int))); + rngset(a, n * sizeof(int), 0, 0); + memcpy(b, a, n * sizeof(int)); + qsort(a, n, sizeof(int), CompareInt); + vqsort_int32_avx2(b, n); + ASSERT_EQ(0, memcmp(b, a, n * sizeof(int))); +} + +TEST(vqsort_int32_sse4, test) { + if (!X86_HAVE(SSE4_2)) return; + size_t n = 5000; + int *a = gc(calloc(n, sizeof(int))); + int *b = gc(calloc(n, sizeof(int))); + rngset(a, n * sizeof(int), 0, 0); + memcpy(b, a, n * sizeof(int)); + qsort(a, n, sizeof(int), CompareInt); + vqsort_int32_sse4(b, n); + ASSERT_EQ(0, memcmp(b, a, n * sizeof(int))); +} + +TEST(vqsort_int32_ssse3, test) { + if (!X86_HAVE(SSSE3)) return; + size_t n = 5000; + int *a = gc(calloc(n, sizeof(int))); + int *b = gc(calloc(n, sizeof(int))); + rngset(a, n * sizeof(int), 0, 0); + memcpy(b, a, n * sizeof(int)); + qsort(a, n, sizeof(int), CompareInt); + vqsort_int32_ssse3(b, n); + ASSERT_EQ(0, memcmp(b, a, n * sizeof(int))); +} + +TEST(vqsort_int32_sse2, test) { + size_t n = 5000; + int *a = gc(calloc(n, sizeof(int))); + int *b = gc(calloc(n, sizeof(int))); + rngset(a, n * sizeof(int), 0, 0); + memcpy(b, a, n * 
sizeof(int)); + qsort(a, n, sizeof(int), CompareInt); + vqsort_int32_sse2(b, n); + ASSERT_EQ(0, memcmp(b, a, n * sizeof(int))); +} + +TEST(radix_sort_int32, test) { + size_t n = 5000; + int *a = gc(calloc(n, sizeof(int))); + int *b = gc(calloc(n, sizeof(int))); + rngset(a, n * sizeof(int), 0, 0); + memcpy(b, a, n * sizeof(int)); + qsort(a, n, sizeof(int), CompareInt); + radix_sort_int32(b, n); + ASSERT_EQ(0, memcmp(b, a, n * sizeof(int))); +} + BENCH(_intsort, bench) { - size_t n = 1000; + printf("\n"); + size_t n = 10000; int *p1 = gc(malloc(n * sizeof(int))); int *p2 = gc(malloc(n * sizeof(int))); rngset(p1, n * sizeof(int), 0, 0); EZBENCH2("_intsort", memcpy(p2, p1, n * sizeof(int)), _intsort(p2, n)); - EZBENCH2("qsort", memcpy(p2, p1, n * sizeof(int)), + if (X86_HAVE(AVX2)) { + EZBENCH2("vqsort_int32_avx2", memcpy(p2, p1, n * sizeof(int)), + vqsort_int32_avx2(p2, n)); + } + if (X86_HAVE(SSE4_2)) { + EZBENCH2("vqsort_int32_sse4", memcpy(p2, p1, n * sizeof(int)), + vqsort_int32_sse4(p2, n)); + } + if (X86_HAVE(SSSE3)) { + EZBENCH2("vqsort_int32_ssse3", memcpy(p2, p1, n * sizeof(int)), + vqsort_int32_ssse3(p2, n)); + } + EZBENCH2("vqsort_int32_sse2", memcpy(p2, p1, n * sizeof(int)), + vqsort_int32_sse2(p2, n)); + EZBENCH2("djbsort", memcpy(p2, p1, n * sizeof(int)), djbsort(p2, n)); + EZBENCH2("radix_sort_int32", memcpy(p2, p1, n * sizeof(int)), + radix_sort_int32(p2, n)); + EZBENCH2("qsort(int)", memcpy(p2, p1, n * sizeof(int)), qsort(p2, n, sizeof(int), CompareInt)); } diff --git a/test/libc/str/test.mk b/test/libc/str/test.mk index 73ff4c7b0..68ab5eb82 100644 --- a/test/libc/str/test.mk +++ b/test/libc/str/test.mk @@ -51,7 +51,8 @@ TEST_LIBC_STR_DIRECTDEPS = \ THIRD_PARTY_REGEX \ THIRD_PARTY_ZLIB \ THIRD_PARTY_LIBCXX \ - THIRD_PARTY_SMALLZ4 + THIRD_PARTY_SMALLZ4 \ + THIRD_PARTY_VQSORT TEST_LIBC_STR_DEPS := \ $(call uniq,$(foreach x,$(TEST_LIBC_STR_DIRECTDEPS),$($(x)))) diff --git a/third_party/compiler_rt/popcountdi2.c b/third_party/compiler_rt/popcountdi2.c index 9ce66b277..ae21c6c47 100644 --- a/third_party/compiler_rt/popcountdi2.c +++ b/third_party/compiler_rt/popcountdi2.c @@ -22,6 +22,10 @@ STATIC_YOINK("huge_compiler_rt_license"); COMPILER_RT_ABI si_int __popcountdi2(di_int a) { +#ifdef __POPCNT__ + asm("popcnt\t%1,%0" : "=r"(a) : "r"(a) : "cc"); + return a; +#else du_int x2 = (du_int)a; x2 = x2 - ((x2 >> 1) & 0x5555555555555555uLL); /* Every 2 bits holds the sum of every pair of bits (32) */ @@ -36,4 +40,5 @@ __popcountdi2(di_int a) /* The lower 16 bits hold two 32 bit sums (6 significant bits). */ /* Upper 16 bits are garbage */ return (x + (x >> 8)) & 0x0000007F; /* (7 significant bits) */ +#endif } diff --git a/third_party/third_party.mk b/third_party/third_party.mk index 570d1715f..ffcd1e863 100644 --- a/third_party/third_party.mk +++ b/third_party/third_party.mk @@ -34,6 +34,7 @@ o/$(MODE)/third_party: \ o/$(MODE)/third_party/tidy \ o/$(MODE)/third_party/tr \ o/$(MODE)/third_party/unzip \ + o/$(MODE)/third_party/vqsort \ o/$(MODE)/third_party/xed \ o/$(MODE)/third_party/zip \ o/$(MODE)/third_party/zlib diff --git a/third_party/vqsort/README.cosmo b/third_party/vqsort/README.cosmo new file mode 100644 index 000000000..677192be9 --- /dev/null +++ b/third_party/vqsort/README.cosmo @@ -0,0 +1,23 @@ +DESCRIPTION + + vqsort implements vectorized quicksort using avx2. this is the fastest + way to sort integers. this goes as fast as djbsort for 32-bit integers + except it supports 64-bit integers too, which go just as fast: about a + gigabyte of memory sorted per second. 
It's 3x faster than simple radix + sort. It's 5x faster than simple quicksort. It's 10x faster than qsort. + +LICENSE + + Apache 2.0 + +ORIGIN + + https://github.com/google/highway/ + commit 50331e0523bbf5f6c94b94263a91680f118e0986 + Author: Jan Wassenberg + Date: Wed Apr 26 11:20:33 2023 -0700 + Faster vqsort for small arrays (7x speedup! for N=100) + +LOCAL CHANGES + + Precompiled because upstream codebase is slow, gigantic, and hairy. diff --git a/third_party/vqsort/vqsort.h b/third_party/vqsort/vqsort.h new file mode 100644 index 000000000..e32517f5c --- /dev/null +++ b/third_party/vqsort/vqsort.h @@ -0,0 +1,20 @@ +#ifndef COSMOPOLITAN_THIRD_PARTY_VQSORT_H_ +#define COSMOPOLITAN_THIRD_PARTY_VQSORT_H_ +#if !(__ASSEMBLER__ + __LINKER__ + 0) +COSMOPOLITAN_C_START_ + +void vqsort_int64(int64_t *, size_t); +void vqsort_int64_avx2(int64_t *, size_t); +void vqsort_int64_sse4(int64_t *, size_t); +void vqsort_int64_ssse3(int64_t *, size_t); +void vqsort_int64_sse2(int64_t *, size_t); + +void vqsort_int32(int32_t *, size_t); +void vqsort_int32_avx2(int32_t *, size_t); +void vqsort_int32_sse4(int32_t *, size_t); +void vqsort_int32_ssse3(int32_t *, size_t); +void vqsort_int32_sse2(int32_t *, size_t); + +COSMOPOLITAN_C_END_ +#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ +#endif /* COSMOPOLITAN_THIRD_PARTY_VQSORT_H_ */ diff --git a/third_party/vqsort/vqsort.mk b/third_party/vqsort/vqsort.mk new file mode 100644 index 000000000..ca1f785ff --- /dev/null +++ b/third_party/vqsort/vqsort.mk @@ -0,0 +1,52 @@ +#-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐ +#───vi: set et ft=make ts=8 tw=8 fenc=utf-8 :vi───────────────────────┘ + +PKGS += THIRD_PARTY_VQSORT + +THIRD_PARTY_VQSORT_ARTIFACTS += THIRD_PARTY_VQSORT_A +THIRD_PARTY_VQSORT = $(THIRD_PARTY_VQSORT_A_DEPS) $(THIRD_PARTY_VQSORT_A) +THIRD_PARTY_VQSORT_A = o/$(MODE)/third_party/vqsort/vqsort.a +THIRD_PARTY_VQSORT_A_FILES := $(wildcard third_party/vqsort/*) +THIRD_PARTY_VQSORT_A_HDRS = $(filter %.h,$(THIRD_PARTY_VQSORT_A_FILES)) +THIRD_PARTY_VQSORT_A_SRCS_C = $(filter %.c,$(THIRD_PARTY_VQSORT_A_FILES)) +THIRD_PARTY_VQSORT_A_SRCS_S = $(filter %.S,$(THIRD_PARTY_VQSORT_A_FILES)) +THIRD_PARTY_VQSORT_A_SRCS = $(THIRD_PARTY_VQSORT_A_SRCS_C) $(THIRD_PARTY_VQSORT_A_SRCS_S) +THIRD_PARTY_VQSORT_A_OBJS_C = $(THIRD_PARTY_VQSORT_A_SRCS_C:%.c=o/$(MODE)/%.o) +THIRD_PARTY_VQSORT_A_OBJS_S = $(THIRD_PARTY_VQSORT_A_SRCS_S:%.S=o/$(MODE)/%.o) +THIRD_PARTY_VQSORT_A_OBJS = $(THIRD_PARTY_VQSORT_A_OBJS_C) $(THIRD_PARTY_VQSORT_A_OBJS_S) + +THIRD_PARTY_VQSORT_A_CHECKS = \ + $(THIRD_PARTY_VQSORT_A).pkg \ + $(THIRD_PARTY_VQSORT_A_HDRS:%=o/$(MODE)/%.ok) + +THIRD_PARTY_VQSORT_A_DIRECTDEPS = \ + LIBC_INTRIN \ + LIBC_MEM \ + LIBC_NEXGEN32E \ + LIBC_RUNTIME \ + LIBC_STDIO \ + LIBC_STR \ + LIBC_STUBS \ + THIRD_PARTY_COMPILER_RT + +THIRD_PARTY_VQSORT_A_DEPS := \ + $(call uniq,$(foreach x,$(THIRD_PARTY_VQSORT_A_DIRECTDEPS),$($(x)))) + +$(THIRD_PARTY_VQSORT_A): \ + third_party/vqsort/ \ + $(THIRD_PARTY_VQSORT_A).pkg \ + $(THIRD_PARTY_VQSORT_A_OBJS) + +$(THIRD_PARTY_VQSORT_A).pkg: \ + $(THIRD_PARTY_VQSORT_A_OBJS) \ + $(foreach x,$(THIRD_PARTY_VQSORT_A_DIRECTDEPS),$($(x)_A).pkg) + +THIRD_PARTY_VQSORT_LIBS = $(foreach x,$(THIRD_PARTY_VQSORT_ARTIFACTS),$($(x))) +THIRD_PARTY_VQSORT_SRCS = $(foreach x,$(THIRD_PARTY_VQSORT_ARTIFACTS),$($(x)_SRCS)) +THIRD_PARTY_VQSORT_HDRS = $(foreach x,$(THIRD_PARTY_VQSORT_ARTIFACTS),$($(x)_HDRS)) +THIRD_PARTY_VQSORT_CHECKS = $(foreach x,$(THIRD_PARTY_VQSORT_ARTIFACTS),$($(x)_CHECKS)) +THIRD_PARTY_VQSORT_OBJS = $(foreach 
x,$(THIRD_PARTY_VQSORT_ARTIFACTS),$($(x)_OBJS)) +$(THIRD_PARTY_VQSORT_OBJS): $(BUILD_FILES) third_party/vqsort/vqsort.mk + +.PHONY: o/$(MODE)/third_party/vqsort +o/$(MODE)/third_party/vqsort: $(THIRD_PARTY_VQSORT_CHECKS) diff --git a/third_party/vqsort/vqsort_i32a.S b/third_party/vqsort/vqsort_i32a.S new file mode 100644 index 000000000..24b3b5d2f --- /dev/null +++ b/third_party/vqsort/vqsort_i32a.S @@ -0,0 +1,24732 @@ + .text + .globl __popcountdi2 + .section .text._ZN3hwy6N_SSE26detail22MaybePartitionTwoValueINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_SSE26detail22MaybePartitionTwoValueINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, @function +_ZN3hwy6N_SSE26detail22MaybePartitionTwoValueINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0: +.LFB18781: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + pushq %r14 + .cfi_offset 15, -24 + .cfi_offset 14, -32 + movq %rcx, %r14 + pushq %r13 + .cfi_offset 13, -40 + movq %rsi, %r13 + pushq %r12 + .cfi_offset 12, -48 + movq %rdi, %r12 + pushq %rbx + subq $88, %rsp + .cfi_offset 3, -56 + movq %rdx, -120(%rbp) + movaps %xmm0, -80(%rbp) + movaps %xmm1, -64(%rbp) + movaps %xmm0, -112(%rbp) + movaps %xmm1, -96(%rbp) + cmpq $3, %rsi + jbe .L32 + movl $4, %r15d + xorl %ebx, %ebx + jmp .L11 + .p2align 4,,10 + .p2align 3 +.L3: + movdqa -80(%rbp), %xmm5 + movmskps %xmm1, %edi + movups %xmm5, (%r12,%rbx,4) + call __popcountdi2@PLT + cltq + addq %rax, %rbx + leaq 4(%r15), %rax + cmpq %r13, %rax + ja .L88 + movq %rax, %r15 +.L11: + movdqu -16(%r12,%r15,4), %xmm1 + movdqu -16(%r12,%r15,4), %xmm0 + leaq -4(%r15), %rdx + pcmpeqd -96(%rbp), %xmm0 + pcmpeqd -112(%rbp), %xmm1 + movdqa %xmm0, %xmm2 + por %xmm1, %xmm0 + movmskps %xmm0, %eax + cmpl $15, %eax + je .L3 + pcmpeqd %xmm0, %xmm0 + pxor %xmm0, %xmm2 + pandn %xmm2, %xmm1 + movmskps %xmm1, %eax + rep bsfl %eax, %eax + cltq + addq %rdx, %rax + movd (%r12,%rax,4), %xmm3 + movq -120(%rbp), %rax + pshufd $0, %xmm3, %xmm0 + movaps %xmm0, (%rax) + leaq 4(%rbx), %rax + cmpq %rdx, %rax + ja .L4 + .p2align 4,,10 + .p2align 3 +.L5: + movdqa -64(%rbp), %xmm4 + movq %rax, %rbx + movups %xmm4, -16(%r12,%rax,4) + addq $4, %rax + cmpq %rdx, %rax + jbe .L5 +.L4: + subq %rbx, %rdx + leaq 0(,%rbx,4), %rcx + movd %edx, %xmm3 + pshufd $0, %xmm3, %xmm0 + pcmpgtd .LC0(%rip), %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L6 + movdqa -64(%rbp), %xmm3 + movd %xmm3, (%r12,%rbx,4) +.L6: + pshufd $85, %xmm0, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L7 + pshufd $85, -64(%rbp), %xmm1 + movd %xmm1, 4(%r12,%rcx) +.L7: + movdqa %xmm0, %xmm1 + punpckhdq %xmm0, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L8 + movdqa -64(%rbp), %xmm3 + movdqa %xmm3, %xmm1 + punpckhdq %xmm3, %xmm1 + movd %xmm1, 8(%r12,%rcx) +.L8: + pshufd $255, %xmm0, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + jne .L89 +.L21: + addq $88, %rsp + xorl %eax, %eax + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L89: + .cfi_restore_state + pshufd $255, -64(%rbp), %xmm0 + movd %xmm0, 12(%r12,%rcx) + jmp .L21 + .p2align 4,,10 + .p2align 3 +.L88: + movq %r13, %r8 + leaq 
0(,%r15,4), %rsi + leaq 0(,%rbx,4), %r9 + subq %r15, %r8 +.L2: + testq %r8, %r8 + je .L15 + leaq 0(,%r8,4), %rdx + addq %r12, %rsi + movq %r14, %rdi + movq %r9, -112(%rbp) + movq %r8, -96(%rbp) + call memcpy@PLT + movq -96(%rbp), %r8 + movq -112(%rbp), %r9 +.L15: + movd %r8d, %xmm3 + movdqa (%r14), %xmm2 + movdqa -80(%rbp), %xmm1 + pshufd $0, %xmm3, %xmm0 + movdqa .LC0(%rip), %xmm3 + pcmpeqd %xmm2, %xmm1 + pcmpeqd -64(%rbp), %xmm2 + pcmpgtd %xmm3, %xmm0 + movdqa %xmm0, %xmm5 + pand %xmm1, %xmm5 + por %xmm2, %xmm1 + pcmpeqd %xmm2, %xmm2 + movdqa %xmm2, %xmm4 + pxor %xmm0, %xmm4 + por %xmm4, %xmm1 + movmskps %xmm1, %eax + cmpl $15, %eax + jne .L90 + movd %xmm0, %eax + testl %eax, %eax + je .L22 + movdqa -80(%rbp), %xmm4 + movd %xmm4, (%r12,%r9) +.L22: + pshufd $85, %xmm0, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L23 + pshufd $85, -80(%rbp), %xmm1 + movd %xmm1, 4(%r12,%r9) +.L23: + movdqa %xmm0, %xmm1 + punpckhdq %xmm0, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L24 + movdqa -80(%rbp), %xmm7 + movdqa %xmm7, %xmm1 + punpckhdq %xmm7, %xmm1 + movd %xmm1, 8(%r12,%r9) +.L24: + pshufd $255, %xmm0, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + jne .L91 +.L25: + movmskps %xmm5, %edi + call __popcountdi2@PLT + movdqa .LC0(%rip), %xmm3 + movslq %eax, %rdx + addq %rbx, %rdx + leaq 4(%rdx), %rax + cmpq %rax, %r13 + jb .L26 + .p2align 4,,10 + .p2align 3 +.L27: + movdqa -64(%rbp), %xmm2 + movq %rax, %rdx + movups %xmm2, -16(%r12,%rax,4) + addq $4, %rax + cmpq %rax, %r13 + jnb .L27 +.L26: + subq %rdx, %r13 + leaq 0(,%rdx,4), %rcx + movd %r13d, %xmm4 + pshufd $0, %xmm4, %xmm0 + pcmpgtd %xmm3, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L28 + movdqa -64(%rbp), %xmm3 + movd %xmm3, (%r12,%rdx,4) +.L28: + pshufd $85, %xmm0, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L29 + pshufd $85, -64(%rbp), %xmm1 + movd %xmm1, 4(%r12,%rcx) +.L29: + movdqa %xmm0, %xmm1 + punpckhdq %xmm0, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L30 + movdqa -64(%rbp), %xmm3 + movdqa %xmm3, %xmm1 + punpckhdq %xmm3, %xmm1 + movd %xmm1, 8(%r12,%rcx) +.L30: + pshufd $255, %xmm0, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L31 + pshufd $255, -64(%rbp), %xmm0 + movd %xmm0, 12(%r12,%rcx) +.L31: + addq $88, %rsp + movl $1, %eax + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L91: + .cfi_restore_state + pshufd $255, -80(%rbp), %xmm0 + movd %xmm0, 12(%r12,%r9) + jmp .L25 +.L32: + movq %rsi, %r8 + xorl %r9d, %r9d + xorl %esi, %esi + xorl %ebx, %ebx + xorl %r15d, %r15d + jmp .L2 +.L90: + pxor %xmm2, %xmm1 + movmskps %xmm1, %eax + rep bsfl %eax, %eax + cltq + addq %r15, %rax + movd (%r12,%rax,4), %xmm4 + movq -120(%rbp), %rax + pshufd $0, %xmm4, %xmm0 + movaps %xmm0, (%rax) + leaq 4(%rbx), %rax + cmpq %rax, %r15 + jb .L16 + .p2align 4,,10 + .p2align 3 +.L17: + movdqa -64(%rbp), %xmm6 + movq %rax, %rbx + movups %xmm6, -16(%r12,%rax,4) + leaq 4(%rax), %rax + cmpq %r15, %rax + jbe .L17 + leaq 0(,%rbx,4), %r9 +.L16: + movq %r15, %rcx + subq %rbx, %rcx + movd %ecx, %xmm4 + pshufd $0, %xmm4, %xmm0 + pcmpgtd %xmm3, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L18 + movdqa -64(%rbp), %xmm3 + movd %xmm3, (%r12,%r9) +.L18: + pshufd $85, %xmm0, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L19 + pshufd $85, -64(%rbp), %xmm1 + movd %xmm1, 4(%r12,%r9) +.L19: + movdqa %xmm0, %xmm1 + punpckhdq %xmm0, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L20 + movdqa -64(%rbp), %xmm3 + movdqa %xmm3, %xmm1 + punpckhdq 
%xmm3, %xmm1 + movd %xmm1, 8(%r12,%r9) +.L20: + pshufd $255, %xmm0, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L21 + pshufd $255, -64(%rbp), %xmm0 + movd %xmm0, 12(%r12,%r9) + jmp .L21 + .cfi_endproc +.LFE18781: + .size _ZN3hwy6N_SSE26detail22MaybePartitionTwoValueINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, .-_ZN3hwy6N_SSE26detail22MaybePartitionTwoValueINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + .section .text._ZN3hwy6N_SSE26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_SSE26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0, @function +_ZN3hwy6N_SSE26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0: +.LFB18782: + .cfi_startproc + cmpq %rdx, %rsi + jbe .L92 + leaq (%rdx,%rdx), %rcx + leaq 1(%rdx), %r10 + leaq 1(%rcx), %rax + addq $2, %rcx + cmpq %rax, %rsi + jbe .L92 + movl (%rdi,%rdx,4), %r11d + movd %r11d, %xmm6 + pshufd $0, %xmm6, %xmm0 + jmp .L95 + .p2align 4,,10 + .p2align 3 +.L96: + cmpq %rcx, %rsi + jbe .L92 + movq %rdx, %rax +.L101: + movd (%rdi,%r10,8), %xmm5 + pshufd $0, %xmm5, %xmm1 + pcmpgtd %xmm3, %xmm1 + movmskps %xmm1, %r8d + andl $1, %r8d + jne .L98 +.L97: + cmpq %rdx, %rax + je .L92 + leaq (%rdi,%rax,4), %rdx + movl (%rdx), %ecx + movl %ecx, (%r9) + movl %r11d, (%rdx) + cmpq %rax, %rsi + jbe .L105 + movq %rax, %rdx +.L99: + leaq (%rdx,%rdx), %rcx + leaq 1(%rdx), %r10 + leaq 1(%rcx), %rax + addq $2, %rcx + cmpq %rsi, %rax + jnb .L92 +.L95: + movd (%rdi,%rax,4), %xmm4 + leaq (%rdi,%rdx,4), %r9 + movdqa %xmm0, %xmm3 + pshufd $0, %xmm4, %xmm1 + movdqa %xmm1, %xmm2 + pcmpgtd %xmm0, %xmm2 + movmskps %xmm2, %r8d + andl $1, %r8d + je .L96 + cmpq %rcx, %rsi + jbe .L97 + movdqa %xmm1, %xmm3 + jmp .L101 + .p2align 4,,10 + .p2align 3 +.L98: + cmpq %rcx, %rdx + je .L106 + leaq (%rdi,%rcx,4), %rax + movl (%rax), %edx + movl %edx, (%r9) + movq %rcx, %rdx + movl %r11d, (%rax) + jmp .L99 + .p2align 4,,10 + .p2align 3 +.L92: + ret + .p2align 4,,10 + .p2align 3 +.L105: + ret +.L106: + ret + .cfi_endproc +.LFE18782: + .size _ZN3hwy6N_SSE26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0, .-_ZN3hwy6N_SSE26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0 + .section .text._ZN3hwy6N_SSE26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_SSE26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0, @function +_ZN3hwy6N_SSE26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0: +.LFB18783: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsi, %rax + salq $2, %rax + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + .cfi_offset 15, -24 + leaq (%rdi,%rax), %r15 + pushq %r14 + .cfi_offset 14, -32 + leaq (%r15,%rax), %r14 + pushq %r13 + .cfi_offset 13, -40 + leaq (%r14,%rax), %r13 + pushq %r12 + .cfi_offset 12, -48 + leaq 0(%r13,%rax), %r12 + pushq %rbx + .cfi_offset 3, -56 + leaq (%r12,%rax), %rbx + leaq (%rbx,%rax), %r11 + leaq (%r11,%rax), %r10 + subq $240, %rsp + leaq 
(%r10,%rax), %r9 + leaq (%r9,%rax), %r8 + movq %rdi, -264(%rbp) + movq %rsi, -240(%rbp) + movdqu (%r15), %xmm6 + movdqu (%rdi), %xmm12 + leaq (%r8,%rax), %rdi + leaq (%rdi,%rax), %rsi + movdqu 0(%r13), %xmm5 + movdqu (%r14), %xmm14 + movdqa %xmm6, %xmm8 + leaq (%rsi,%rax), %rcx + movdqu (%rbx), %xmm3 + movdqu (%r12), %xmm11 + pcmpgtd %xmm12, %xmm8 + leaq (%rcx,%rax), %rdx + movdqu (%r10), %xmm2 + movdqu (%r11), %xmm10 + movdqu (%rdx), %xmm0 + movdqu (%rsi), %xmm4 + movq %rdx, -248(%rbp) + addq %rax, %rdx + movdqu (%rdx), %xmm15 + movdqu (%r8), %xmm1 + addq %rdx, %rax + movq %rdx, -256(%rbp) + movdqa %xmm8, %xmm13 + movdqu (%r9), %xmm7 + movdqu (%rdi), %xmm9 + pandn %xmm6, %xmm13 + movaps %xmm15, -112(%rbp) + pand %xmm8, %xmm6 + movdqa %xmm13, %xmm15 + movdqa %xmm12, %xmm13 + pand %xmm8, %xmm13 + por %xmm15, %xmm13 + movdqa %xmm8, %xmm15 + pandn %xmm12, %xmm15 + movdqa %xmm5, %xmm12 + pcmpgtd %xmm14, %xmm12 + por %xmm15, %xmm6 + movdqa %xmm12, %xmm8 + pandn %xmm5, %xmm8 + pand %xmm12, %xmm5 + movdqa %xmm8, %xmm15 + movdqa %xmm14, %xmm8 + pand %xmm12, %xmm8 + por %xmm15, %xmm8 + movdqa %xmm12, %xmm15 + pandn %xmm14, %xmm15 + movdqa %xmm3, %xmm14 + pcmpgtd %xmm11, %xmm14 + por %xmm15, %xmm5 + movdqa %xmm14, %xmm12 + pandn %xmm3, %xmm12 + pand %xmm14, %xmm3 + movdqa %xmm12, %xmm15 + movdqa %xmm11, %xmm12 + pand %xmm14, %xmm12 + por %xmm15, %xmm12 + movdqa %xmm14, %xmm15 + pandn %xmm11, %xmm15 + movdqa %xmm2, %xmm11 + pcmpgtd %xmm10, %xmm11 + por %xmm15, %xmm3 + movaps %xmm3, -64(%rbp) + movdqa %xmm10, %xmm3 + movdqa %xmm11, %xmm14 + pand %xmm11, %xmm3 + pandn %xmm2, %xmm14 + pand %xmm11, %xmm2 + por %xmm14, %xmm3 + movdqa %xmm11, %xmm14 + pandn %xmm10, %xmm14 + movdqa %xmm1, %xmm10 + pcmpgtd %xmm7, %xmm10 + por %xmm14, %xmm2 + movdqa %xmm10, %xmm11 + pandn %xmm1, %xmm11 + pand %xmm10, %xmm1 + movdqa %xmm11, %xmm14 + movdqa %xmm7, %xmm11 + pand %xmm10, %xmm11 + por %xmm14, %xmm11 + movdqa %xmm10, %xmm14 + pandn %xmm7, %xmm14 + movdqa %xmm4, %xmm7 + pcmpgtd %xmm9, %xmm7 + por %xmm14, %xmm1 + movaps %xmm1, -80(%rbp) + movdqa %xmm7, %xmm10 + movdqa %xmm7, %xmm1 + movdqa %xmm9, %xmm7 + pandn %xmm4, %xmm10 + pand %xmm1, %xmm7 + pand %xmm1, %xmm4 + por %xmm10, %xmm7 + movdqa %xmm1, %xmm10 + movdqa %xmm0, %xmm1 + pandn %xmm9, %xmm10 + por %xmm10, %xmm4 + movdqu (%rcx), %xmm10 + pcmpgtd %xmm10, %xmm1 + movdqu (%rcx), %xmm10 + movdqu (%rcx), %xmm15 + movdqa %xmm1, %xmm9 + pand %xmm1, %xmm10 + pandn %xmm0, %xmm9 + pand %xmm1, %xmm0 + por %xmm9, %xmm10 + movdqa %xmm1, %xmm9 + movdqu (%rax), %xmm1 + pandn %xmm15, %xmm9 + movdqa -112(%rbp), %xmm15 + por %xmm9, %xmm0 + pcmpgtd %xmm15, %xmm1 + movaps %xmm0, -96(%rbp) + movdqu (%rax), %xmm0 + movdqa %xmm1, %xmm9 + pandn %xmm0, %xmm9 + movdqa %xmm15, %xmm0 + pand %xmm1, %xmm0 + por %xmm9, %xmm0 + movdqa %xmm1, %xmm9 + pandn %xmm15, %xmm9 + movdqu (%rax), %xmm15 + pand %xmm15, %xmm1 + movdqa %xmm13, %xmm15 + por %xmm9, %xmm1 + movdqa %xmm8, %xmm9 + pcmpgtd %xmm13, %xmm9 + movdqa %xmm9, %xmm14 + pandn %xmm8, %xmm9 + pand %xmm14, %xmm15 + pand %xmm14, %xmm8 + por %xmm15, %xmm9 + movdqa %xmm14, %xmm15 + movdqa %xmm6, %xmm14 + pandn %xmm13, %xmm15 + movdqa %xmm5, %xmm13 + pcmpgtd %xmm6, %xmm13 + por %xmm8, %xmm15 + movdqa %xmm13, %xmm8 + pand %xmm13, %xmm14 + pandn %xmm5, %xmm8 + pand %xmm13, %xmm5 + por %xmm14, %xmm8 + movdqa %xmm13, %xmm14 + pandn %xmm6, %xmm14 + movdqa %xmm3, %xmm6 + pcmpgtd %xmm12, %xmm6 + por %xmm14, %xmm5 + movdqa -64(%rbp), %xmm14 + movaps %xmm5, -112(%rbp) + movdqa %xmm12, %xmm5 + movdqa %xmm6, %xmm13 + pand %xmm6, %xmm5 + pandn %xmm3, 
%xmm13 + pand %xmm6, %xmm3 + por %xmm13, %xmm5 + movdqa %xmm6, %xmm13 + movdqa %xmm14, %xmm6 + pandn %xmm12, %xmm13 + movdqa %xmm2, %xmm12 + pcmpgtd %xmm14, %xmm12 + por %xmm13, %xmm3 + movdqa %xmm12, %xmm13 + pand %xmm12, %xmm6 + pandn %xmm2, %xmm13 + pand %xmm12, %xmm2 + por %xmm13, %xmm6 + movdqa %xmm12, %xmm13 + pandn %xmm14, %xmm13 + movdqa %xmm11, %xmm14 + por %xmm13, %xmm2 + movdqa %xmm7, %xmm13 + pcmpgtd %xmm11, %xmm13 + movdqa %xmm13, %xmm12 + pand %xmm13, %xmm14 + pandn %xmm7, %xmm12 + pand %xmm13, %xmm7 + por %xmm14, %xmm12 + movdqa %xmm13, %xmm14 + pandn %xmm11, %xmm14 + movdqa %xmm4, %xmm11 + por %xmm14, %xmm7 + movdqa -80(%rbp), %xmm14 + movaps %xmm7, -128(%rbp) + pcmpgtd %xmm14, %xmm11 + movdqa %xmm14, %xmm13 + movdqa %xmm11, %xmm7 + pand %xmm11, %xmm13 + pandn %xmm4, %xmm7 + pand %xmm11, %xmm4 + por %xmm13, %xmm7 + movdqa %xmm11, %xmm13 + movdqa %xmm0, %xmm11 + pcmpgtd %xmm10, %xmm11 + pandn %xmm14, %xmm13 + movdqa -96(%rbp), %xmm14 + por %xmm13, %xmm4 + movaps %xmm4, -80(%rbp) + movdqa %xmm10, %xmm4 + movdqa %xmm11, %xmm13 + pand %xmm11, %xmm4 + pandn %xmm0, %xmm13 + pand %xmm11, %xmm0 + por %xmm13, %xmm4 + movdqa %xmm11, %xmm13 + movdqa %xmm1, %xmm11 + pcmpgtd %xmm14, %xmm11 + pandn %xmm10, %xmm13 + movdqa %xmm14, %xmm10 + por %xmm13, %xmm0 + movdqa %xmm11, %xmm13 + pand %xmm11, %xmm10 + pandn %xmm1, %xmm13 + pand %xmm11, %xmm1 + por %xmm13, %xmm10 + movdqa %xmm11, %xmm13 + pandn %xmm14, %xmm13 + movdqa %xmm9, %xmm14 + por %xmm13, %xmm1 + movdqa %xmm5, %xmm13 + pcmpgtd %xmm9, %xmm13 + movdqa %xmm13, %xmm11 + pand %xmm13, %xmm14 + pandn %xmm5, %xmm11 + pand %xmm13, %xmm5 + por %xmm14, %xmm11 + movdqa %xmm13, %xmm14 + movdqa %xmm8, %xmm13 + pandn %xmm9, %xmm14 + movdqa %xmm6, %xmm9 + pcmpgtd %xmm8, %xmm9 + por %xmm5, %xmm14 + movdqa %xmm9, %xmm5 + pand %xmm9, %xmm13 + pandn %xmm6, %xmm5 + pand %xmm9, %xmm6 + por %xmm13, %xmm5 + movdqa %xmm9, %xmm13 + movdqa %xmm15, %xmm9 + pandn %xmm8, %xmm13 + movdqa %xmm3, %xmm8 + pcmpgtd %xmm15, %xmm8 + por %xmm13, %xmm6 + movaps %xmm6, -96(%rbp) + movdqa %xmm8, %xmm6 + pand %xmm8, %xmm9 + pandn %xmm3, %xmm6 + pand %xmm8, %xmm3 + por %xmm9, %xmm6 + movdqa %xmm8, %xmm9 + movdqa %xmm2, %xmm8 + pandn %xmm15, %xmm9 + movdqa -112(%rbp), %xmm15 + por %xmm9, %xmm3 + pcmpgtd %xmm15, %xmm8 + movaps %xmm3, -144(%rbp) + movdqa %xmm15, %xmm9 + movdqa %xmm8, %xmm3 + pand %xmm8, %xmm9 + pandn %xmm2, %xmm3 + pand %xmm8, %xmm2 + por %xmm9, %xmm3 + movdqa %xmm8, %xmm9 + movdqa %xmm12, %xmm8 + pandn %xmm15, %xmm9 + movdqa -128(%rbp), %xmm15 + por %xmm9, %xmm2 + movaps %xmm2, -64(%rbp) + movdqa %xmm4, %xmm2 + pcmpgtd %xmm12, %xmm2 + movdqa %xmm2, %xmm9 + pand %xmm2, %xmm8 + pandn %xmm4, %xmm9 + pand %xmm2, %xmm4 + por %xmm9, %xmm8 + movdqa %xmm2, %xmm9 + movdqa %xmm10, %xmm2 + pcmpgtd %xmm7, %xmm2 + pandn %xmm12, %xmm9 + por %xmm9, %xmm4 + movdqa %xmm7, %xmm9 + movdqa %xmm2, %xmm12 + pand %xmm2, %xmm9 + pandn %xmm10, %xmm12 + pand %xmm2, %xmm10 + por %xmm12, %xmm9 + movdqa %xmm2, %xmm12 + movdqa %xmm15, %xmm2 + pandn %xmm7, %xmm12 + movdqa %xmm0, %xmm7 + pcmpgtd %xmm15, %xmm7 + por %xmm12, %xmm10 + movdqa %xmm7, %xmm12 + pand %xmm7, %xmm2 + pandn %xmm0, %xmm12 + pand %xmm7, %xmm0 + por %xmm12, %xmm2 + movdqa %xmm7, %xmm12 + pandn %xmm15, %xmm12 + movdqa -80(%rbp), %xmm15 + por %xmm12, %xmm0 + movdqa %xmm1, %xmm12 + pcmpgtd %xmm15, %xmm12 + movdqa %xmm12, %xmm7 + pandn %xmm1, %xmm7 + pand %xmm12, %xmm1 + movdqa %xmm7, %xmm13 + movdqa %xmm15, %xmm7 + pand %xmm12, %xmm7 + por %xmm13, %xmm7 + movdqa %xmm12, %xmm13 + movdqa %xmm8, %xmm12 + pcmpgtd %xmm11, 
%xmm12 + pandn %xmm15, %xmm13 + por %xmm13, %xmm1 + movdqa %xmm12, %xmm15 + pandn %xmm8, %xmm15 + pand %xmm12, %xmm8 + movdqa %xmm15, %xmm13 + movdqa %xmm11, %xmm15 + pand %xmm12, %xmm15 + por %xmm13, %xmm15 + movaps %xmm15, -112(%rbp) + movdqa %xmm12, %xmm15 + movdqa %xmm5, %xmm12 + pandn %xmm11, %xmm15 + movdqa %xmm15, %xmm13 + movdqa %xmm8, %xmm15 + movdqa %xmm9, %xmm8 + pcmpgtd %xmm5, %xmm8 + por %xmm13, %xmm15 + movdqa -96(%rbp), %xmm13 + movdqa %xmm8, %xmm11 + pand %xmm8, %xmm12 + pandn %xmm9, %xmm11 + pand %xmm8, %xmm9 + por %xmm11, %xmm12 + movdqa %xmm8, %xmm11 + movdqa %xmm2, %xmm8 + pcmpgtd %xmm6, %xmm8 + pandn %xmm5, %xmm11 + movdqa %xmm6, %xmm5 + movaps %xmm12, -128(%rbp) + por %xmm11, %xmm9 + movdqa %xmm8, %xmm11 + pand %xmm8, %xmm5 + pandn %xmm2, %xmm11 + pand %xmm8, %xmm2 + por %xmm11, %xmm5 + movdqa %xmm8, %xmm11 + movdqa %xmm7, %xmm8 + pcmpgtd %xmm3, %xmm8 + pandn %xmm6, %xmm11 + movdqa %xmm3, %xmm6 + movaps %xmm5, -80(%rbp) + por %xmm11, %xmm2 + movdqa %xmm8, %xmm11 + pand %xmm8, %xmm6 + pandn %xmm7, %xmm11 + pand %xmm8, %xmm7 + por %xmm11, %xmm6 + movdqa %xmm8, %xmm11 + movdqa %xmm4, %xmm8 + pcmpgtd %xmm14, %xmm8 + pandn %xmm3, %xmm11 + movdqa %xmm14, %xmm3 + por %xmm11, %xmm7 + movdqa %xmm8, %xmm11 + pand %xmm8, %xmm3 + pandn %xmm4, %xmm11 + pand %xmm8, %xmm4 + por %xmm11, %xmm3 + movdqa %xmm8, %xmm11 + movdqa %xmm13, %xmm8 + pandn %xmm14, %xmm11 + movdqa -144(%rbp), %xmm14 + por %xmm11, %xmm4 + movdqa %xmm10, %xmm11 + pcmpgtd %xmm13, %xmm11 + movdqa %xmm11, %xmm12 + pand %xmm11, %xmm8 + pandn %xmm10, %xmm12 + pand %xmm11, %xmm10 + por %xmm12, %xmm8 + movdqa %xmm11, %xmm12 + movdqa %xmm0, %xmm11 + pcmpgtd %xmm14, %xmm11 + pandn %xmm13, %xmm12 + por %xmm12, %xmm10 + movdqa %xmm14, %xmm12 + movdqa %xmm11, %xmm13 + pand %xmm11, %xmm12 + pandn %xmm0, %xmm13 + pand %xmm11, %xmm0 + por %xmm13, %xmm12 + movdqa %xmm11, %xmm13 + pandn %xmm14, %xmm13 + movdqa -64(%rbp), %xmm14 + por %xmm13, %xmm0 + movdqa %xmm1, %xmm13 + pcmpgtd %xmm14, %xmm13 + movdqa %xmm13, %xmm11 + movdqa %xmm13, %xmm5 + pandn -64(%rbp), %xmm5 + pandn %xmm1, %xmm11 + pand %xmm13, %xmm1 + pand %xmm13, %xmm14 + por %xmm5, %xmm1 + por %xmm14, %xmm11 + movdqa %xmm8, %xmm13 + movaps %xmm1, -192(%rbp) + movdqa %xmm2, %xmm1 + pcmpgtd %xmm8, %xmm1 + movdqa %xmm1, %xmm14 + pand %xmm1, %xmm13 + pandn %xmm2, %xmm14 + pand %xmm1, %xmm2 + por %xmm14, %xmm13 + movdqa %xmm1, %xmm14 + movdqa %xmm12, %xmm1 + pandn %xmm8, %xmm14 + movdqa %xmm9, %xmm8 + pcmpgtd %xmm12, %xmm8 + por %xmm14, %xmm2 + movdqa %xmm8, %xmm14 + pand %xmm8, %xmm1 + pandn %xmm9, %xmm14 + pand %xmm8, %xmm9 + por %xmm14, %xmm1 + movdqa %xmm8, %xmm14 + movdqa %xmm4, %xmm8 + pcmpgtd %xmm6, %xmm8 + pandn %xmm12, %xmm14 + por %xmm14, %xmm9 + movdqa %xmm6, %xmm14 + movdqa %xmm8, %xmm5 + pand %xmm8, %xmm14 + pandn %xmm4, %xmm5 + pand %xmm8, %xmm4 + por %xmm5, %xmm14 + movdqa %xmm8, %xmm5 + movdqa %xmm11, %xmm8 + pandn %xmm6, %xmm5 + movdqa %xmm7, %xmm6 + pcmpgtd %xmm11, %xmm6 + por %xmm5, %xmm4 + movdqa %xmm6, %xmm5 + pand %xmm6, %xmm8 + pandn %xmm7, %xmm5 + pand %xmm6, %xmm7 + por %xmm5, %xmm8 + movdqa %xmm6, %xmm5 + movdqa %xmm10, %xmm6 + pandn %xmm11, %xmm5 + movdqa %xmm0, %xmm11 + pcmpgtd %xmm10, %xmm11 + por %xmm5, %xmm7 + movdqa %xmm11, %xmm5 + pand %xmm11, %xmm6 + pandn %xmm0, %xmm5 + pand %xmm11, %xmm0 + por %xmm5, %xmm6 + movdqa %xmm11, %xmm5 + movdqa %xmm15, %xmm11 + pcmpgtd %xmm3, %xmm11 + pandn %xmm10, %xmm5 + movdqa %xmm3, %xmm10 + por %xmm5, %xmm0 + movaps %xmm0, -64(%rbp) + movdqa -128(%rbp), %xmm0 + movdqa %xmm11, %xmm5 + pand %xmm11, %xmm10 + 
pandn %xmm15, %xmm5 + pand %xmm11, %xmm15 + por %xmm5, %xmm10 + movdqa %xmm11, %xmm5 + pandn %xmm3, %xmm5 + movdqa %xmm0, %xmm3 + por %xmm5, %xmm15 + movdqa -80(%rbp), %xmm5 + movdqa %xmm5, %xmm11 + pcmpgtd %xmm0, %xmm11 + movdqa %xmm11, %xmm12 + pand %xmm11, %xmm3 + pandn %xmm5, %xmm12 + movdqa %xmm11, %xmm5 + pandn %xmm0, %xmm5 + por %xmm12, %xmm3 + movdqa %xmm5, %xmm12 + movdqa -80(%rbp), %xmm5 + movdqa %xmm3, %xmm0 + pand %xmm11, %xmm5 + movdqa %xmm10, %xmm11 + pcmpgtd %xmm3, %xmm11 + por %xmm12, %xmm5 + movdqa %xmm11, %xmm12 + pand %xmm11, %xmm0 + pandn %xmm10, %xmm12 + pand %xmm11, %xmm10 + por %xmm0, %xmm12 + movdqa -64(%rbp), %xmm0 + movaps %xmm12, -128(%rbp) + movdqa %xmm11, %xmm12 + movdqa %xmm6, %xmm11 + pcmpgtd %xmm8, %xmm11 + pandn %xmm3, %xmm12 + movdqa %xmm8, %xmm3 + por %xmm12, %xmm10 + movdqa %xmm11, %xmm12 + pand %xmm11, %xmm3 + pandn %xmm6, %xmm12 + pand %xmm11, %xmm6 + por %xmm12, %xmm3 + movdqa %xmm11, %xmm12 + movdqa %xmm15, %xmm11 + pcmpgtd %xmm5, %xmm11 + pandn %xmm8, %xmm12 + movdqa %xmm5, %xmm8 + por %xmm12, %xmm6 + movdqa %xmm11, %xmm12 + pand %xmm11, %xmm8 + pandn %xmm15, %xmm12 + pand %xmm11, %xmm15 + por %xmm12, %xmm8 + movdqa %xmm11, %xmm12 + movdqa %xmm7, %xmm11 + pandn %xmm5, %xmm12 + movdqa %xmm0, %xmm5 + pcmpgtd %xmm7, %xmm5 + por %xmm12, %xmm15 + movdqa %xmm5, %xmm12 + pand %xmm5, %xmm11 + pandn %xmm0, %xmm12 + pand %xmm5, %xmm0 + por %xmm12, %xmm11 + movdqa %xmm5, %xmm12 + pandn %xmm7, %xmm12 + movdqa %xmm8, %xmm7 + por %xmm12, %xmm0 + movdqa %xmm1, %xmm12 + movaps %xmm0, -208(%rbp) + movdqa %xmm10, %xmm0 + pcmpgtd %xmm13, %xmm12 + cmpq $1, -240(%rbp) + pcmpgtd %xmm8, %xmm0 + movdqa %xmm0, %xmm5 + pand %xmm0, %xmm7 + pandn %xmm10, %xmm5 + pand %xmm0, %xmm10 + por %xmm5, %xmm7 + movdqa %xmm0, %xmm5 + movdqa %xmm12, %xmm0 + pandn %xmm8, %xmm5 + pandn %xmm1, %xmm0 + pand %xmm12, %xmm1 + movaps %xmm7, -144(%rbp) + por %xmm5, %xmm10 + movdqa %xmm13, %xmm5 + movdqa %xmm9, %xmm7 + pand %xmm12, %xmm5 + movdqa %xmm11, %xmm8 + por %xmm0, %xmm5 + movdqa %xmm12, %xmm0 + pandn %xmm13, %xmm0 + movdqa %xmm6, %xmm13 + por %xmm0, %xmm1 + pcmpgtd %xmm11, %xmm13 + movdqa %xmm2, %xmm0 + pcmpgtd %xmm9, %xmm0 + movdqa %xmm1, %xmm12 + pand %xmm13, %xmm8 + movdqa %xmm0, %xmm1 + pand %xmm0, %xmm7 + pandn %xmm2, %xmm1 + pand %xmm0, %xmm2 + por %xmm1, %xmm7 + movdqa %xmm0, %xmm1 + movdqa %xmm13, %xmm0 + pandn %xmm9, %xmm1 + pandn %xmm6, %xmm0 + movdqa %xmm14, %xmm9 + por %xmm1, %xmm2 + movdqa %xmm15, %xmm1 + por %xmm0, %xmm8 + pcmpgtd %xmm14, %xmm1 + movdqa %xmm13, %xmm0 + pand %xmm6, %xmm13 + pandn %xmm11, %xmm0 + movdqa %xmm3, %xmm6 + movdqa %xmm7, %xmm11 + por %xmm0, %xmm13 + movdqa %xmm1, %xmm0 + pand %xmm1, %xmm9 + pandn %xmm15, %xmm0 + pand %xmm1, %xmm15 + por %xmm0, %xmm9 + movdqa %xmm1, %xmm0 + pandn %xmm14, %xmm0 + movdqa %xmm9, %xmm14 + por %xmm0, %xmm15 + movdqa %xmm4, %xmm0 + pcmpgtd %xmm3, %xmm0 + movdqa %xmm0, %xmm1 + pand %xmm0, %xmm6 + pandn %xmm4, %xmm1 + pand %xmm0, %xmm4 + por %xmm1, %xmm6 + movdqa %xmm0, %xmm1 + movdqa %xmm5, %xmm0 + pcmpgtd %xmm9, %xmm0 + pcmpgtd %xmm6, %xmm11 + pandn %xmm3, %xmm1 + por %xmm1, %xmm4 + movdqa %xmm12, %xmm3 + movdqa %xmm0, %xmm1 + pand %xmm0, %xmm14 + pandn %xmm5, %xmm1 + pand %xmm0, %xmm5 + por %xmm1, %xmm14 + movdqa %xmm0, %xmm1 + pandn %xmm9, %xmm1 + movdqa %xmm6, %xmm9 + por %xmm1, %xmm5 + movdqa %xmm15, %xmm1 + pand %xmm11, %xmm9 + pcmpgtd %xmm12, %xmm1 + movdqa %xmm1, %xmm0 + pand %xmm1, %xmm3 + pandn %xmm15, %xmm0 + por %xmm0, %xmm3 + movdqa %xmm1, %xmm0 + pand %xmm15, %xmm1 + pandn %xmm12, %xmm0 + por %xmm0, %xmm1 + 
movdqa %xmm11, %xmm0 + pandn %xmm7, %xmm0 + pand %xmm11, %xmm7 + por %xmm0, %xmm9 + movdqa %xmm11, %xmm0 + pandn %xmm6, %xmm0 + por %xmm0, %xmm7 + movdqa %xmm4, %xmm0 + pcmpgtd %xmm2, %xmm0 + movdqa %xmm7, %xmm11 + movdqa %xmm2, %xmm7 + movdqa %xmm0, %xmm6 + movdqa %xmm0, %xmm12 + pand %xmm0, %xmm7 + pandn %xmm2, %xmm6 + movdqa %xmm10, %xmm2 + pand %xmm4, %xmm0 + pcmpgtd %xmm14, %xmm2 + por %xmm6, %xmm0 + pandn %xmm4, %xmm12 + movdqa %xmm14, %xmm6 + por %xmm12, %xmm7 + movdqa %xmm2, %xmm4 + pand %xmm2, %xmm6 + pandn %xmm10, %xmm4 + pand %xmm2, %xmm10 + por %xmm4, %xmm6 + movdqa %xmm2, %xmm4 + movdqa %xmm3, %xmm2 + pcmpgtd %xmm5, %xmm2 + pandn %xmm14, %xmm4 + movaps %xmm6, -160(%rbp) + movdqa %xmm5, %xmm6 + por %xmm4, %xmm10 + movaps %xmm10, -64(%rbp) + movdqa %xmm8, %xmm10 + movdqa %xmm2, %xmm4 + pand %xmm2, %xmm6 + pandn %xmm3, %xmm4 + pand %xmm2, %xmm3 + por %xmm4, %xmm6 + movdqa %xmm2, %xmm4 + movdqa %xmm1, %xmm2 + pcmpgtd %xmm9, %xmm2 + pandn %xmm5, %xmm4 + movaps %xmm6, -80(%rbp) + movdqa %xmm9, %xmm5 + por %xmm4, %xmm3 + movdqa %xmm7, %xmm6 + pcmpgtd %xmm11, %xmm6 + movdqa %xmm2, %xmm4 + pand %xmm2, %xmm5 + pandn %xmm1, %xmm4 + pand %xmm2, %xmm1 + por %xmm4, %xmm5 + movdqa %xmm2, %xmm4 + movdqa %xmm11, %xmm2 + pandn %xmm9, %xmm4 + pand %xmm6, %xmm2 + por %xmm4, %xmm1 + movdqa %xmm6, %xmm4 + pandn %xmm7, %xmm4 + pand %xmm6, %xmm7 + por %xmm4, %xmm2 + movdqa %xmm6, %xmm4 + pandn %xmm11, %xmm4 + movdqa %xmm2, %xmm9 + por %xmm4, %xmm7 + pcmpgtd %xmm1, %xmm9 + movdqa %xmm0, %xmm4 + pcmpgtd %xmm8, %xmm4 + movdqa %xmm4, %xmm6 + pand %xmm4, %xmm10 + pandn %xmm0, %xmm6 + pand %xmm4, %xmm0 + por %xmm6, %xmm10 + movdqa %xmm4, %xmm6 + movdqa %xmm5, %xmm4 + pcmpgtd %xmm3, %xmm4 + pandn %xmm8, %xmm6 + movdqa %xmm3, %xmm8 + por %xmm6, %xmm0 + movdqa %xmm4, %xmm6 + pand %xmm4, %xmm8 + pandn %xmm5, %xmm6 + pand %xmm4, %xmm5 + por %xmm6, %xmm8 + movdqa %xmm4, %xmm6 + movdqa %xmm9, %xmm4 + pandn %xmm3, %xmm6 + movdqa %xmm1, %xmm3 + pandn %xmm2, %xmm4 + movaps %xmm8, -96(%rbp) + pand %xmm9, %xmm3 + por %xmm6, %xmm5 + pand %xmm9, %xmm2 + por %xmm4, %xmm3 + movdqa %xmm9, %xmm4 + movdqa %xmm5, %xmm12 + pandn %xmm1, %xmm4 + por %xmm4, %xmm2 + jbe .L112 + movdqa -112(%rbp), %xmm5 + pshufd $177, %xmm13, %xmm14 + pshufd $177, -192(%rbp), %xmm13 + movdqa %xmm13, %xmm8 + pshufd $177, %xmm3, %xmm6 + pshufd $177, %xmm0, %xmm0 + pshufd $177, %xmm7, %xmm7 + pshufd $177, -208(%rbp), %xmm15 + pcmpgtd %xmm5, %xmm8 + movaps %xmm6, -176(%rbp) + movdqa %xmm5, %xmm4 + movdqa %xmm15, %xmm9 + movdqa -160(%rbp), %xmm3 + pshufd $177, %xmm10, %xmm10 + pshufd $177, %xmm2, %xmm2 + movdqa %xmm8, %xmm6 + movdqa %xmm8, %xmm1 + pand %xmm8, %xmm4 + pandn %xmm5, %xmm6 + movdqa -128(%rbp), %xmm5 + pandn %xmm13, %xmm1 + pand %xmm13, %xmm8 + por %xmm1, %xmm4 + movaps %xmm6, -208(%rbp) + movdqa -144(%rbp), %xmm6 + pcmpgtd %xmm5, %xmm9 + movdqa %xmm5, %xmm11 + movaps %xmm4, -192(%rbp) + movdqa %xmm6, %xmm4 + movdqa %xmm9, %xmm1 + pand %xmm9, %xmm11 + pandn %xmm15, %xmm1 + por %xmm1, %xmm11 + movdqa %xmm9, %xmm1 + pand %xmm15, %xmm9 + pandn %xmm5, %xmm1 + movdqa %xmm14, %xmm5 + pcmpgtd %xmm6, %xmm5 + movaps %xmm1, -224(%rbp) + movdqa %xmm5, %xmm1 + pand %xmm5, %xmm4 + pandn %xmm14, %xmm1 + por %xmm1, %xmm4 + movdqa %xmm5, %xmm1 + pand %xmm14, %xmm5 + pandn %xmm6, %xmm1 + movaps %xmm4, -112(%rbp) + movdqa %xmm3, %xmm6 + movaps %xmm1, -288(%rbp) + movdqa %xmm0, %xmm1 + pcmpgtd %xmm3, %xmm1 + movdqa %xmm1, %xmm4 + pand %xmm1, %xmm6 + pandn %xmm0, %xmm4 + por %xmm4, %xmm6 + movdqa %xmm1, %xmm4 + pand %xmm0, %xmm1 + pandn %xmm3, %xmm4 + movaps 
%xmm6, -128(%rbp) + movdqa -64(%rbp), %xmm6 + movaps %xmm4, -304(%rbp) + movdqa %xmm10, %xmm4 + por -304(%rbp), %xmm1 + pcmpgtd %xmm6, %xmm4 + pshufd $177, %xmm1, %xmm1 + movdqa %xmm4, %xmm3 + pandn %xmm10, %xmm3 + pand %xmm4, %xmm10 + movaps %xmm3, -320(%rbp) + movdqa %xmm4, %xmm3 + pand -64(%rbp), %xmm4 + por -320(%rbp), %xmm4 + pandn %xmm6, %xmm3 + por %xmm3, %xmm10 + movdqa %xmm7, %xmm3 + pshufd $177, %xmm4, %xmm4 + movaps %xmm10, -144(%rbp) + movdqa -80(%rbp), %xmm10 + pcmpgtd %xmm10, %xmm3 + movdqa %xmm3, %xmm6 + pandn %xmm7, %xmm3 + movaps %xmm3, -336(%rbp) + movdqa %xmm6, %xmm3 + pand %xmm6, %xmm7 + pand -80(%rbp), %xmm6 + pandn %xmm10, %xmm3 + por %xmm3, %xmm7 + movdqa %xmm2, %xmm3 + movaps %xmm7, -160(%rbp) + movdqa -96(%rbp), %xmm7 + pcmpgtd %xmm7, %xmm3 + movdqa %xmm3, %xmm10 + pandn %xmm2, %xmm3 + movaps %xmm3, -352(%rbp) + movdqa %xmm10, %xmm3 + pand %xmm10, %xmm2 + pandn %xmm7, %xmm3 + movdqa -176(%rbp), %xmm7 + por %xmm3, %xmm2 + pcmpgtd %xmm12, %xmm7 + movdqa %xmm7, %xmm3 + pandn -176(%rbp), %xmm3 + movaps %xmm3, -368(%rbp) + movdqa %xmm7, %xmm3 + pandn %xmm12, %xmm3 + movaps %xmm3, -384(%rbp) + movdqa -176(%rbp), %xmm3 + pand %xmm7, %xmm3 + pand %xmm12, %xmm7 + por -384(%rbp), %xmm3 + por -336(%rbp), %xmm6 + por -368(%rbp), %xmm7 + movdqa -192(%rbp), %xmm13 + pand -96(%rbp), %xmm10 + pshufd $177, %xmm7, %xmm7 + pshufd $177, %xmm6, %xmm6 + por -352(%rbp), %xmm10 + movdqa %xmm7, %xmm0 + por -288(%rbp), %xmm5 + por -224(%rbp), %xmm9 + pcmpgtd %xmm13, %xmm0 + pshufd $177, %xmm10, %xmm14 + por -208(%rbp), %xmm8 + pshufd $177, %xmm9, %xmm15 + movdqa %xmm13, %xmm9 + movaps %xmm14, -64(%rbp) + pshufd $177, %xmm5, %xmm5 + pshufd $177, %xmm8, %xmm12 + movdqa %xmm3, %xmm8 + movdqa %xmm0, %xmm10 + pandn %xmm7, %xmm0 + pand %xmm10, %xmm9 + por %xmm0, %xmm9 + movdqa %xmm10, %xmm0 + pand %xmm7, %xmm10 + pandn %xmm13, %xmm0 + movdqa %xmm12, %xmm13 + pcmpgtd %xmm3, %xmm13 + movaps %xmm0, -80(%rbp) + movdqa %xmm13, %xmm0 + pand %xmm13, %xmm8 + pandn %xmm12, %xmm0 + por %xmm0, %xmm8 + movdqa %xmm13, %xmm0 + pand %xmm12, %xmm13 + pandn %xmm3, %xmm0 + movaps %xmm8, -224(%rbp) + movdqa %xmm11, %xmm8 + movaps %xmm0, -304(%rbp) + movdqa %xmm14, %xmm0 + pcmpgtd %xmm11, %xmm0 + movdqa %xmm0, %xmm3 + pandn -64(%rbp), %xmm3 + pand %xmm0, %xmm8 + por %xmm3, %xmm8 + movdqa %xmm0, %xmm3 + pand -64(%rbp), %xmm0 + pandn %xmm11, %xmm3 + movdqa %xmm2, %xmm11 + movaps %xmm8, -96(%rbp) + movaps %xmm3, -320(%rbp) + movdqa %xmm15, %xmm3 + por -320(%rbp), %xmm0 + pcmpgtd %xmm2, %xmm3 + pshufd $177, %xmm0, %xmm0 + movdqa %xmm3, %xmm14 + pand %xmm3, %xmm11 + pandn %xmm15, %xmm14 + por %xmm14, %xmm11 + movdqa %xmm3, %xmm14 + pand %xmm15, %xmm3 + movaps %xmm11, -176(%rbp) + movdqa -112(%rbp), %xmm11 + pandn %xmm2, %xmm14 + movdqa %xmm6, %xmm2 + movaps %xmm14, -336(%rbp) + pcmpgtd %xmm11, %xmm2 + movdqa %xmm2, %xmm14 + movdqa %xmm2, %xmm8 + pandn %xmm6, %xmm14 + pand %xmm2, %xmm6 + pandn %xmm11, %xmm8 + movaps %xmm14, -352(%rbp) + movdqa %xmm6, %xmm14 + movdqa %xmm5, %xmm6 + pand -112(%rbp), %xmm2 + por %xmm8, %xmm14 + por -352(%rbp), %xmm2 + movdqa -160(%rbp), %xmm8 + movaps %xmm14, -192(%rbp) + pcmpgtd %xmm8, %xmm6 + pshufd $177, %xmm2, %xmm2 + movdqa %xmm6, %xmm14 + pandn %xmm5, %xmm6 + movaps %xmm6, -368(%rbp) + movdqa %xmm14, %xmm6 + pand %xmm14, %xmm5 + pandn %xmm8, %xmm6 + por %xmm6, %xmm5 + movdqa -128(%rbp), %xmm6 + movaps %xmm5, -208(%rbp) + movdqa %xmm4, %xmm5 + pcmpgtd %xmm6, %xmm5 + movdqa %xmm5, %xmm8 + pandn %xmm4, %xmm5 + movdqa %xmm5, %xmm11 + movdqa %xmm8, %xmm5 + pand %xmm8, %xmm4 + pandn %xmm6, 
[… remainder of the generated N_SSE2 SortingNetwork: pcmpgtd/pand/pandn/por compare-exchanges with pshufd/punpckldq/shufpd lane merges, the .L109 block that stores the sorted vectors back through the saved row pointers and unwinds the frame, and the .L112/.L113 register-shuffle tails …]
+.LFE18783:
+	.size	_ZN3hwy6N_SSE26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0, .-_ZN3hwy6N_SSE26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0
+	.section	.text._ZN3hwy11N_AVX3_ZEN46detail22MaybePartitionTwoValueINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0,"ax",@progbits
+	.p2align 4
+	.type	_ZN3hwy11N_AVX3_ZEN46detail22MaybePartitionTwoValueINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, @function
+_ZN3hwy11N_AVX3_ZEN46detail22MaybePartitionTwoValueINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0:
+.LFB18784:
[… N_AVX3_ZEN4 MaybePartitionTwoValue body: masked vmovdqu32/vpcmpd loop over 16-lane blocks, kortestw/popcnt bookkeeping, and bzhi-generated write masks for the ragged tail …]
+.LFE18784:
+	.size	_ZN3hwy11N_AVX3_ZEN46detail22MaybePartitionTwoValueINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, .-_ZN3hwy11N_AVX3_ZEN46detail22MaybePartitionTwoValueINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0
+	.section	.text._ZN3hwy11N_AVX3_ZEN46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0,"ax",@progbits
+	.p2align 4
+	.type	_ZN3hwy11N_AVX3_ZEN46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0, @function
+_ZN3hwy11N_AVX3_ZEN46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0:
+.LFB18785:
[… N_AVX3_ZEN4 SiftDown body: loop that broadcasts the parent element and uses vpcmpd to compare it against the children at 2i+1 and 2i+2, swapping until the heap property holds …]
+.LFE18785:
+	.size	_ZN3hwy11N_AVX3_ZEN46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0, .-_ZN3hwy11N_AVX3_ZEN46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0
+	.section	.text._ZN3hwy11N_AVX3_ZEN46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0,"ax",@progbits
+	.p2align 4
+	.type	_ZN3hwy11N_AVX3_ZEN46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0, @function
+_ZN3hwy11N_AVX3_ZEN46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0:
+.LFB18786:
[… N_AVX3_ZEN4 SortingNetwork: zmm-register vpminsd/vpmaxsd compare-exchange network with vpshufd/vpermd/vshufi32x4 merge stages gated by the column count (cmpq $1/$3/$7/$15, %rsi); listing continues …]
%zmm19{%k1} + vpminsd %zmm5, %zmm9, %zmm14 + vpmaxsd %zmm5, %zmm9, %zmm9 + vmovdqa64 %zmm14, %zmm9{%k1} + vpshufd $177, %zmm13, %zmm14 + vmovdqa64 %zmm15, %zmm20{%k1} + kmovw %eax, %k1 + vpmaxsd %zmm13, %zmm14, %zmm5 + vpminsd %zmm13, %zmm14, %zmm5{%k1} + vpshufd $177, %zmm12, %zmm14 + vpmaxsd %zmm12, %zmm14, %zmm13 + vpminsd %zmm12, %zmm14, %zmm13{%k1} + vpshufd $177, %zmm11, %zmm12 + vpmaxsd %zmm11, %zmm12, %zmm18 + vpminsd %zmm11, %zmm12, %zmm18{%k1} + vpshufd $177, %zmm10, %zmm11 + vpmaxsd %zmm10, %zmm11, %zmm17 + vpminsd %zmm10, %zmm11, %zmm17{%k1} + vpshufd $177, %zmm7, %zmm10 + vpmaxsd %zmm7, %zmm10, %zmm14 + vpminsd %zmm7, %zmm10, %zmm14{%k1} + vpshufd $177, %zmm4, %zmm7 + vpmaxsd %zmm4, %zmm7, %zmm16 + vpminsd %zmm4, %zmm7, %zmm16{%k1} + vpshufd $177, %zmm3, %zmm4 + vpmaxsd %zmm3, %zmm4, %zmm15 + vpminsd %zmm3, %zmm4, %zmm15{%k1} + vpshufd $177, %zmm1, %zmm3 + vpshufd $177, %zmm9, %zmm4 + vpmaxsd %zmm1, %zmm3, %zmm12 + vpminsd %zmm1, %zmm3, %zmm12{%k1} + vpshufd $177, %zmm0, %zmm1 + vpshufd $177, %zmm2, %zmm3 + vpmaxsd %zmm0, %zmm1, %zmm10 + vpmaxsd %zmm2, %zmm3, %zmm7 + vpminsd %zmm0, %zmm1, %zmm10{%k1} + vpshufd $177, %zmm8, %zmm1 + vpshufd $177, %zmm6, %zmm0 + vpminsd %zmm2, %zmm3, %zmm7{%k1} + vpmaxsd %zmm8, %zmm1, %zmm2 + vpminsd %zmm8, %zmm1, %zmm2{%k1} + vpmaxsd %zmm6, %zmm0, %zmm8 + vpshufd $177, %zmm20, %zmm1 + vpminsd %zmm6, %zmm0, %zmm8{%k1} + vpshufd $177, %zmm21, %zmm0 + vpmaxsd %zmm20, %zmm1, %zmm3 + vpmaxsd %zmm21, %zmm0, %zmm6 + vpminsd %zmm20, %zmm1, %zmm3{%k1} + vpminsd %zmm21, %zmm0, %zmm6{%k1} + vpshufd $177, %zmm19, %zmm0 + vpmaxsd %zmm19, %zmm0, %zmm1 + vpminsd %zmm19, %zmm0, %zmm1{%k1} + vpmaxsd %zmm9, %zmm4, %zmm0 + vpminsd %zmm9, %zmm4, %zmm0{%k1} +.L160: + vmovq %xmm26, %rax + vmovdqu64 %zmm5, (%rdi) + vmovdqu64 %zmm13, (%r15) + vmovdqu64 %zmm18, (%r14) + vmovdqu64 %zmm17, 0(%r13) + vmovdqu64 %zmm14, (%r12) + vmovdqu64 %zmm16, (%rbx) + vmovdqu64 %zmm15, (%r11) + vmovdqu64 %zmm12, (%r10) + vmovdqu64 %zmm7, (%r9) + vmovdqu64 %zmm10, (%r8) + vmovdqu64 %zmm2, (%rax) + vmovq %xmm25, %rax + vmovdqu64 %zmm8, (%rax) + vmovq %xmm27, %rax + vmovdqu64 %zmm6, (%rax) + vmovq %xmm24, %rax + vmovdqu64 %zmm3, (%rdx) + vmovdqu64 %zmm1, (%rax) + movq (%rsp), %rax + vmovdqu64 %zmm0, (%rax) + vzeroupper + leaq -40(%rbp), %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_def_cfa 7, 8 + ret + .cfi_endproc +.LFE18786: + .size _ZN3hwy11N_AVX3_ZEN46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0, .-_ZN3hwy11N_AVX3_ZEN46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + .section .text._ZN3hwy6N_AVX36detail22MaybePartitionTwoValueINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_AVX36detail22MaybePartitionTwoValueINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, @function +_ZN3hwy6N_AVX36detail22MaybePartitionTwoValueINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0: +.LFB18787: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsi, %r10 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %rbx + .cfi_offset 3, -24 + cmpq $15, %rsi + jbe .L181 + vmovdqa32 %zmm0, %zmm2 + vmovdqa32 %zmm1, %zmm3 + movl 
$16, %r8d + xorl %esi, %esi + jmp .L172 + .p2align 4,,10 + .p2align 3 +.L167: + vmovdqu64 %zmm0, (%rax) + kmovw %k0, %eax + popcntq %rax, %rax + addq %rax, %rsi + leaq 16(%r8), %rax + cmpq %r10, %rax + ja .L192 + movq %rax, %r8 +.L172: + vmovdqu32 -64(%rdi,%r8,4), %zmm4 + leaq -16(%r8), %r9 + leaq (%rdi,%rsi,4), %rax + vpcmpd $0, %zmm2, %zmm4, %k0 + vpcmpd $0, %zmm3, %zmm4, %k1 + kmovw %k0, %r11d + kmovw %k1, %ebx + korw %k1, %k0, %k1 + kortestw %k1, %k1 + jc .L167 + kmovw %r11d, %k6 + kmovw %ebx, %k5 + kxnorw %k5, %k6, %k7 + kmovw %k7, %eax + tzcntl %eax, %eax + addq %r9, %rax + vpbroadcastd (%rdi,%rax,4), %zmm0 + leaq 16(%rsi), %rax + vmovdqa32 %zmm0, (%rdx) + cmpq %r9, %rax + ja .L168 + .p2align 4,,10 + .p2align 3 +.L169: + vmovdqu64 %zmm1, -64(%rdi,%rax,4) + movq %rax, %rsi + addq $16, %rax + cmpq %rax, %r9 + jnb .L169 +.L168: + subq %rsi, %r9 + leaq (%rdi,%rsi,4), %rdx + movl $65535, %eax + cmpq $255, %r9 + jbe .L193 +.L170: + kmovw %eax, %k4 + xorl %eax, %eax + vmovdqu32 %zmm1, (%rdx){%k4} +.L165: + movq -8(%rbp), %rbx + leave + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L193: + .cfi_restore_state + movq $-1, %rax + bzhi %r9, %rax, %rax + movzwl %ax, %eax + jmp .L170 + .p2align 4,,10 + .p2align 3 +.L192: + movq %r10, %r11 + leaq (%rdi,%r8,4), %rbx + leaq (%rdi,%rsi,4), %r9 + movl $65535, %eax + subq %r8, %r11 + kmovd %eax, %k1 + cmpq $255, %r11 + jbe .L166 +.L173: + vmovdqu32 (%rbx), %zmm2{%k1}{z} + knotw %k1, %k3 + vmovdqu32 %zmm2, (%rcx){%k1} + vmovdqa32 (%rcx), %zmm2 + vpcmpd $0, %zmm0, %zmm2, %k0 + vpcmpd $0, %zmm1, %zmm2, %k2 + kandw %k1, %k0, %k0 + korw %k2, %k0, %k2 + korw %k3, %k2, %k2 + kortestw %k2, %k2 + jnc .L194 + kmovw %k0, %edx + popcntq %rdx, %rdx + addq %rsi, %rdx + vmovdqu32 %zmm0, (%r9){%k1} + leaq 16(%rdx), %rax + cmpq %r10, %rax + ja .L178 + .p2align 4,,10 + .p2align 3 +.L179: + vmovdqu64 %zmm1, -64(%rdi,%rax,4) + movq %rax, %rdx + addq $16, %rax + cmpq %rax, %r10 + jnb .L179 +.L178: + subq %rdx, %r10 + leaq (%rdi,%rdx,4), %rcx + movl $65535, %eax + cmpq $255, %r10 + ja .L180 + movq $-1, %rax + bzhi %r10, %rax, %rax + movzwl %ax, %eax +.L180: + kmovw %eax, %k5 + movl $1, %eax + vmovdqu32 %zmm1, (%rcx){%k5} + movq -8(%rbp), %rbx + leave + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret +.L181: + .cfi_restore_state + movq %rsi, %r11 + movq %rdi, %r9 + movq %rdi, %rbx + xorl %esi, %esi + xorl %r8d, %r8d +.L166: + movq $-1, %rax + bzhi %r11, %rax, %rax + movzwl %ax, %eax + kmovd %eax, %k1 + jmp .L173 +.L194: + knotw %k2, %k3 + kmovw %k3, %eax + tzcntl %eax, %eax + addq %r8, %rax + vpbroadcastd (%rdi,%rax,4), %zmm0 + leaq 16(%rsi), %rax + vmovdqa32 %zmm0, (%rdx) + cmpq %r8, %rax + ja .L175 + .p2align 4,,10 + .p2align 3 +.L176: + vmovdqu64 %zmm1, -64(%rdi,%rax,4) + movq %rax, %rsi + leaq 16(%rax), %rax + cmpq %rax, %r8 + jnb .L176 + leaq (%rdi,%rsi,4), %r9 +.L175: + subq %rsi, %r8 + movl $65535, %eax + cmpq $255, %r8 + ja .L177 + movq $-1, %rax + bzhi %r8, %rax, %rax + movzwl %ax, %eax +.L177: + kmovw %eax, %k6 + xorl %eax, %eax + vmovdqu32 %zmm1, (%r9){%k6} + jmp .L165 + .cfi_endproc +.LFE18787: + .size _ZN3hwy6N_AVX36detail22MaybePartitionTwoValueINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, .-_ZN3hwy6N_AVX36detail22MaybePartitionTwoValueINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + .section 
.text._ZN3hwy6N_AVX36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_AVX36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0, @function +_ZN3hwy6N_AVX36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0: +.LFB18788: + .cfi_startproc + movq %rsi, %r8 + movq %rdx, %rcx + cmpq %rdx, %rsi + jbe .L205 + leaq (%rdx,%rdx), %rdx + leaq 1(%rcx), %r10 + leaq 1(%rdx), %rsi + addq $2, %rdx + cmpq %rsi, %r8 + jbe .L205 + movl (%rdi,%rcx,4), %r11d + vpbroadcastd %r11d, %xmm1 + jmp .L198 + .p2align 4,,10 + .p2align 3 +.L208: + movq %rsi, %rax + cmpq %rdx, %r8 + ja .L206 +.L200: + cmpq %rcx, %rax + je .L205 + leaq (%rdi,%rax,4), %rdx + movl (%rdx), %ecx + movl %ecx, (%r9) + movl %r11d, (%rdx) + cmpq %rax, %r8 + jbe .L207 + leaq (%rax,%rax), %rdx + leaq 1(%rax), %r10 + leaq 1(%rdx), %rsi + addq $2, %rdx + cmpq %r8, %rsi + jnb .L205 + movq %rax, %rcx +.L198: + vpbroadcastd (%rdi,%rsi,4), %xmm0 + leaq (%rdi,%rcx,4), %r9 + vpcmpd $6, %xmm1, %xmm0, %k0 + kmovb %k0, %eax + testb $1, %al + jne .L208 + cmpq %rdx, %r8 + jbe .L205 + vpbroadcastd (%rdi,%r10,8), %xmm0 + vpcmpd $6, %xmm1, %xmm0, %k1 + kmovb %k1, %eax + testb $1, %al + je .L205 + movq %rdx, %rax + jmp .L200 + .p2align 4,,10 + .p2align 3 +.L205: + ret + .p2align 4,,10 + .p2align 3 +.L206: + vpbroadcastd (%rdi,%r10,8), %xmm2 + vpcmpd $6, %xmm0, %xmm2, %k2 + kmovb %k2, %esi + andl $1, %esi + cmovne %rdx, %rax + jmp .L200 + .p2align 4,,10 + .p2align 3 +.L207: + ret + .cfi_endproc +.LFE18788: + .size _ZN3hwy6N_AVX36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0, .-_ZN3hwy6N_AVX36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0 + .section .text._ZN3hwy6N_AVX36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_AVX36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0, @function +_ZN3hwy6N_AVX36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0: +.LFB18789: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + leaq 0(,%rsi,4), %rax + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + .cfi_offset 15, -24 + leaq (%rdi,%rax), %r15 + pushq %r14 + .cfi_offset 14, -32 + leaq (%r15,%rax), %r14 + pushq %r13 + .cfi_offset 13, -40 + leaq (%r14,%rax), %r13 + pushq %r12 + .cfi_offset 12, -48 + leaq 0(%r13,%rax), %r12 + pushq %rbx + .cfi_offset 3, -56 + leaq (%r12,%rax), %rbx + leaq (%rbx,%rax), %r11 + leaq (%r11,%rax), %r10 + leaq (%r10,%rax), %r9 + andq $-64, %rsp + leaq (%r9,%rax), %r8 + subq $8, %rsp + leaq (%r8,%rax), %rdx + leaq (%rdx,%rax), %rcx + vmovq %rdx, %xmm26 + leaq (%rcx,%rax), %rdx + vmovq %rcx, %xmm25 + vmovdqu32 (%rdi), %zmm7 + vpminsd (%r15), %zmm7, %zmm15 + vmovq %rdx, %xmm27 + addq %rax, %rdx + vpmaxsd (%r15), %zmm7, %zmm0 + vmovdqu32 (%r14), %zmm7 + leaq (%rdx,%rax), %rcx + vpminsd 0(%r13), %zmm7, %zmm16 + addq %rcx, %rax + vmovq %rcx, %xmm24 + vmovq %xmm26, %rcx + movq %rax, (%rsp) + vmovq %xmm25, %rax + vpmaxsd 0(%r13), %zmm7, %zmm10 + vpminsd %zmm16, %zmm15, %zmm12 + vmovdqu32 (%r12), %zmm7 + vpminsd (%rbx), %zmm7, %zmm11 + vpmaxsd %zmm16, %zmm15, %zmm15 + vpmaxsd (%rbx), %zmm7, %zmm2 + vmovdqu32 (%r11), %zmm7 + vpminsd 
%zmm10, %zmm0, %zmm16 + vpminsd (%r10), %zmm7, %zmm8 + vpmaxsd (%r10), %zmm7, %zmm6 + vmovdqu32 (%r9), %zmm7 + vpminsd (%r8), %zmm7, %zmm1 + vpmaxsd %zmm10, %zmm0, %zmm0 + vpmaxsd (%r8), %zmm7, %zmm4 + vmovdqu32 (%rcx), %zmm7 + vpminsd %zmm8, %zmm11, %zmm10 + vpminsd (%rax), %zmm7, %zmm9 + vpmaxsd (%rax), %zmm7, %zmm13 + vmovq %xmm27, %rax + vmovdqu32 (%rax), %zmm7 + movq (%rsp), %rax + vpmaxsd %zmm8, %zmm11, %zmm11 + vpminsd %zmm6, %zmm2, %zmm8 + vpminsd (%rdx), %zmm7, %zmm3 + vpmaxsd (%rdx), %zmm7, %zmm5 + vmovdqu64 (%rax), %zmm7 + vmovq %xmm24, %rax + vpmaxsd %zmm6, %zmm2, %zmm2 + vpminsd %zmm9, %zmm1, %zmm6 + vpmaxsd %zmm9, %zmm1, %zmm1 + vpminsd %zmm13, %zmm4, %zmm9 + vmovdqa64 %zmm7, -120(%rsp) + vmovdqa32 -120(%rsp), %zmm7 + vpminsd (%rax), %zmm7, %zmm14 + vpmaxsd %zmm13, %zmm4, %zmm4 + vpmaxsd (%rax), %zmm7, %zmm7 + vpminsd %zmm14, %zmm3, %zmm13 + vpmaxsd %zmm14, %zmm3, %zmm3 + vpminsd %zmm7, %zmm5, %zmm14 + vpmaxsd %zmm7, %zmm5, %zmm5 + vpminsd %zmm10, %zmm12, %zmm7 + vpmaxsd %zmm10, %zmm12, %zmm12 + vpminsd %zmm8, %zmm16, %zmm10 + vpmaxsd %zmm8, %zmm16, %zmm16 + vpminsd %zmm11, %zmm15, %zmm8 + vpmaxsd %zmm11, %zmm15, %zmm15 + vpminsd %zmm2, %zmm0, %zmm11 + vpmaxsd %zmm2, %zmm0, %zmm0 + vpminsd %zmm13, %zmm6, %zmm2 + vpmaxsd %zmm13, %zmm6, %zmm6 + vpminsd %zmm14, %zmm9, %zmm13 + vpmaxsd %zmm14, %zmm9, %zmm9 + vpminsd %zmm3, %zmm1, %zmm14 + vpmaxsd %zmm3, %zmm1, %zmm1 + vpminsd %zmm5, %zmm4, %zmm3 + vpmaxsd %zmm5, %zmm4, %zmm4 + vpminsd %zmm2, %zmm7, %zmm5 + vpmaxsd %zmm2, %zmm7, %zmm7 + vpminsd %zmm13, %zmm10, %zmm2 + vpmaxsd %zmm13, %zmm10, %zmm10 + vpminsd %zmm14, %zmm8, %zmm13 + vpmaxsd %zmm14, %zmm8, %zmm8 + vpminsd %zmm3, %zmm11, %zmm14 + vpmaxsd %zmm3, %zmm11, %zmm11 + vpminsd %zmm6, %zmm12, %zmm3 + vpmaxsd %zmm6, %zmm12, %zmm12 + vpminsd %zmm9, %zmm16, %zmm6 + vpmaxsd %zmm9, %zmm16, %zmm16 + vpminsd %zmm1, %zmm15, %zmm9 + vpmaxsd %zmm1, %zmm15, %zmm15 + vpminsd %zmm4, %zmm0, %zmm1 + vpmaxsd %zmm4, %zmm0, %zmm0 + vpminsd %zmm8, %zmm6, %zmm4 + vpmaxsd %zmm8, %zmm6, %zmm6 + vpminsd %zmm10, %zmm9, %zmm8 + vpmaxsd %zmm10, %zmm9, %zmm9 + vpminsd %zmm12, %zmm14, %zmm10 + vpmaxsd %zmm12, %zmm14, %zmm14 + vpminsd %zmm11, %zmm1, %zmm12 + vpmaxsd %zmm11, %zmm1, %zmm1 + vpminsd %zmm15, %zmm16, %zmm11 + vpmaxsd %zmm15, %zmm16, %zmm16 + vpminsd %zmm7, %zmm3, %zmm15 + vpmaxsd %zmm7, %zmm3, %zmm3 + vpminsd %zmm13, %zmm2, %zmm7 + vpmaxsd %zmm13, %zmm2, %zmm2 + vpminsd %zmm15, %zmm7, %zmm13 + vpmaxsd %zmm15, %zmm7, %zmm7 + vpminsd %zmm11, %zmm12, %zmm15 + vpmaxsd %zmm11, %zmm12, %zmm12 + vpminsd %zmm3, %zmm2, %zmm11 + vpmaxsd %zmm3, %zmm2, %zmm2 + vpminsd %zmm16, %zmm1, %zmm3 + vpminsd %zmm7, %zmm11, %zmm18 + vpmaxsd %zmm7, %zmm11, %zmm11 + vpminsd %zmm8, %zmm4, %zmm7 + vpmaxsd %zmm8, %zmm4, %zmm4 + vpminsd %zmm6, %zmm9, %zmm8 + vpmaxsd %zmm6, %zmm9, %zmm9 + vpminsd %zmm12, %zmm3, %zmm6 + vpmaxsd %zmm12, %zmm3, %zmm3 + vpminsd %zmm2, %zmm10, %zmm12 + vpmaxsd %zmm2, %zmm10, %zmm10 + vpminsd %zmm14, %zmm15, %zmm2 + vpmaxsd %zmm14, %zmm15, %zmm15 + vpminsd %zmm7, %zmm12, %zmm14 + vpmaxsd %zmm7, %zmm12, %zmm12 + vpminsd %zmm10, %zmm4, %zmm7 + vpmaxsd %zmm10, %zmm4, %zmm4 + vpminsd %zmm8, %zmm2, %zmm10 + vpmaxsd %zmm8, %zmm2, %zmm2 + vpminsd %zmm15, %zmm9, %zmm8 + vpmaxsd %zmm16, %zmm1, %zmm1 + vpmaxsd %zmm15, %zmm9, %zmm9 + vpminsd %zmm7, %zmm12, %zmm16 + vpmaxsd %zmm7, %zmm12, %zmm12 + vpminsd %zmm4, %zmm10, %zmm7 + vpmaxsd %zmm4, %zmm10, %zmm10 + vpminsd %zmm8, %zmm2, %zmm4 + vpminsd %zmm7, %zmm12, %zmm15 + vpminsd %zmm11, %zmm14, %zmm17 + vpmaxsd %zmm8, %zmm2, %zmm2 + vpmaxsd %zmm7, %zmm12, 
%zmm12 + vpminsd %zmm9, %zmm6, %zmm8 + vpminsd %zmm4, %zmm10, %zmm7 + vpmaxsd %zmm11, %zmm14, %zmm14 + vpmaxsd %zmm9, %zmm6, %zmm6 + vpmaxsd %zmm4, %zmm10, %zmm10 + cmpq $1, %rsi + jbe .L211 + vpshufd $177, %zmm2, %zmm2 + vpshufd $177, %zmm0, %zmm0 + vpshufd $177, %zmm8, %zmm8 + movl $21845, %eax + vpminsd %zmm0, %zmm5, %zmm22 + vpshufd $177, %zmm3, %zmm3 + vpmaxsd %zmm0, %zmm5, %zmm9 + kmovw %eax, %k1 + vpminsd %zmm2, %zmm16, %zmm5 + vpshufd $177, %zmm10, %zmm10 + vpminsd %zmm3, %zmm18, %zmm20 + vpshufd $177, %zmm7, %zmm7 + vpshufd $177, %zmm1, %zmm1 + vpmaxsd %zmm3, %zmm18, %zmm0 + vpminsd %zmm8, %zmm14, %zmm11 + vpminsd %zmm10, %zmm15, %zmm4 + vpshufd $177, %zmm6, %zmm6 + vpshufd $177, %zmm5, %zmm5 + vpminsd %zmm1, %zmm13, %zmm21 + vpminsd %zmm6, %zmm17, %zmm19 + vpmaxsd %zmm2, %zmm16, %zmm16 + vpminsd %zmm7, %zmm12, %zmm3 + vpshufd $177, %zmm9, %zmm9 + vpminsd %zmm5, %zmm20, %zmm2 + vpmaxsd %zmm1, %zmm13, %zmm13 + vpshufd $177, %zmm11, %zmm11 + vpmaxsd %zmm7, %zmm12, %zmm1 + vpmaxsd %zmm6, %zmm17, %zmm6 + vpshufd $177, %zmm0, %zmm0 + vpshufd $177, %zmm4, %zmm4 + vpminsd %zmm9, %zmm1, %zmm18 + vpmaxsd %zmm8, %zmm14, %zmm14 + vpminsd %zmm4, %zmm21, %zmm12 + vpmaxsd %zmm10, %zmm15, %zmm10 + vpshufd $177, %zmm3, %zmm7 + vpshufd $177, %zmm2, %zmm2 + vpminsd %zmm11, %zmm19, %zmm3 + vpshufd $177, %zmm6, %zmm6 + vpshufd $177, %zmm13, %zmm13 + vpmaxsd %zmm9, %zmm1, %zmm1 + vpmaxsd %zmm4, %zmm21, %zmm4 + vpminsd %zmm0, %zmm16, %zmm9 + vpminsd %zmm7, %zmm22, %zmm17 + vpshufd $177, %zmm4, %zmm4 + vpminsd %zmm13, %zmm10, %zmm15 + vpmaxsd %zmm5, %zmm20, %zmm5 + vpshufd $177, %zmm3, %zmm3 + vpmaxsd %zmm0, %zmm16, %zmm0 + vpminsd %zmm6, %zmm14, %zmm8 + vpshufd $177, %zmm9, %zmm16 + vpmaxsd %zmm6, %zmm14, %zmm6 + vpminsd %zmm2, %zmm12, %zmm20 + vpshufd $177, %zmm1, %zmm1 + vpmaxsd %zmm7, %zmm22, %zmm7 + vpmaxsd %zmm13, %zmm10, %zmm13 + vpshufd $177, %zmm8, %zmm8 + vpminsd %zmm3, %zmm17, %zmm14 + vpmaxsd %zmm11, %zmm19, %zmm11 + vpshufd $177, %zmm13, %zmm13 + vpmaxsd %zmm2, %zmm12, %zmm12 + vpminsd %zmm1, %zmm6, %zmm19 + vpshufd $177, %zmm20, %zmm2 + vpminsd %zmm4, %zmm5, %zmm9 + vpshufd $177, %zmm7, %zmm7 + vpmaxsd %zmm4, %zmm5, %zmm5 + vpmaxsd %zmm1, %zmm6, %zmm1 + vpminsd %zmm16, %zmm15, %zmm4 + vpshufd $177, %zmm9, %zmm9 + vpminsd %zmm7, %zmm11, %zmm10 + vpshufd $177, %zmm4, %zmm4 + vpshufd $177, %zmm1, %zmm1 + vpmaxsd %zmm3, %zmm17, %zmm3 + vpmaxsd %zmm7, %zmm11, %zmm7 + vpminsd %zmm8, %zmm18, %zmm17 + vpmaxsd %zmm8, %zmm18, %zmm8 + vpshufd $177, %zmm7, %zmm7 + vpminsd %zmm13, %zmm0, %zmm18 + vpmaxsd %zmm13, %zmm0, %zmm0 + vpshufd $177, %zmm3, %zmm3 + vpminsd %zmm2, %zmm14, %zmm13 + vpminsd %zmm4, %zmm17, %zmm23 + vpshufd $177, %zmm18, %zmm18 + vpminsd %zmm1, %zmm0, %zmm20 + vpmaxsd %zmm16, %zmm15, %zmm16 + vpshufd $177, %zmm8, %zmm8 + vpminsd %zmm9, %zmm10, %zmm15 + vpmaxsd %zmm9, %zmm10, %zmm10 + vpmaxsd %zmm4, %zmm17, %zmm9 + vpmaxsd %zmm1, %zmm0, %zmm4 + vpshufd $177, %zmm13, %zmm0 + vpmaxsd %zmm2, %zmm14, %zmm11 + vpminsd %zmm3, %zmm12, %zmm14 + vpmaxsd %zmm7, %zmm5, %zmm2 + vpmaxsd %zmm3, %zmm12, %zmm3 + vpminsd %zmm7, %zmm5, %zmm12 + vpmaxsd %zmm0, %zmm13, %zmm5 + vpminsd %zmm0, %zmm13, %zmm5{%k1} + vpshufd $177, %zmm11, %zmm0 + vpminsd %zmm18, %zmm19, %zmm21 + vpmaxsd %zmm0, %zmm11, %zmm13 + vpmaxsd %zmm18, %zmm19, %zmm19 + vpminsd %zmm0, %zmm11, %zmm13{%k1} + vpshufd $177, %zmm14, %zmm0 + vpminsd %zmm8, %zmm16, %zmm22 + vpmaxsd %zmm0, %zmm14, %zmm18 + vpmaxsd %zmm8, %zmm16, %zmm6 + vpshufd $177, %zmm22, %zmm1 + vpminsd %zmm0, %zmm14, %zmm18{%k1} + vpshufd $177, %zmm3, %zmm0 + vpmaxsd 
%zmm0, %zmm3, %zmm17 + vpminsd %zmm0, %zmm3, %zmm17{%k1} + vpshufd $177, %zmm15, %zmm0 + vpshufd $177, %zmm4, %zmm3 + vpmaxsd %zmm0, %zmm15, %zmm14 + vpminsd %zmm0, %zmm15, %zmm14{%k1} + vpshufd $177, %zmm10, %zmm0 + vpmaxsd %zmm0, %zmm10, %zmm16 + vpminsd %zmm0, %zmm10, %zmm16{%k1} + vpshufd $177, %zmm12, %zmm0 + vpmaxsd %zmm0, %zmm12, %zmm15 + vpminsd %zmm0, %zmm12, %zmm15{%k1} + vpshufd $177, %zmm2, %zmm0 + vpmaxsd %zmm0, %zmm2, %zmm12 + vpminsd %zmm0, %zmm2, %zmm12{%k1} + vpshufd $177, %zmm23, %zmm0 + vpmaxsd %zmm1, %zmm22, %zmm2 + vpmaxsd %zmm0, %zmm23, %zmm7 + vpminsd %zmm1, %zmm22, %zmm2{%k1} + vpminsd %zmm0, %zmm23, %zmm7{%k1} + vpshufd $177, %zmm9, %zmm0 + vpmaxsd %zmm0, %zmm9, %zmm10 + vpminsd %zmm0, %zmm9, %zmm10{%k1} + vpshufd $177, %zmm6, %zmm0 + vpmaxsd %zmm0, %zmm6, %zmm8 + vpminsd %zmm0, %zmm6, %zmm8{%k1} + vpshufd $177, %zmm21, %zmm0 + vpmaxsd %zmm0, %zmm21, %zmm6 + vpminsd %zmm0, %zmm21, %zmm6{%k1} + vpshufd $177, %zmm19, %zmm0 + vpmaxsd %zmm0, %zmm19, %zmm9 + vpminsd %zmm0, %zmm19, %zmm9{%k1} + vpshufd $177, %zmm20, %zmm0 + vpmaxsd %zmm0, %zmm20, %zmm1 + vpminsd %zmm0, %zmm20, %zmm1{%k1} + vpmaxsd %zmm3, %zmm4, %zmm0 + vpminsd %zmm3, %zmm4, %zmm0{%k1} + vmovdqa64 %zmm9, %zmm3 + cmpq $3, %rsi + jbe .L211 + vpshufd $27, %zmm2, %zmm2 + vpshufd $27, %zmm8, %zmm8 + vpshufd $27, %zmm9, %zmm9 + movl $85, %eax + vpminsd %zmm9, %zmm18, %zmm20 + vpshufd $27, %zmm7, %zmm7 + vpshufd $27, %zmm10, %zmm10 + kmovb %eax, %k2 + vpshufd $27, %zmm6, %zmm6 + vpshufd $27, %zmm1, %zmm1 + vpshufd $27, %zmm0, %zmm0 + vpmaxsd %zmm9, %zmm18, %zmm23 + vpmaxsd %zmm8, %zmm14, %zmm18 + vpminsd %zmm8, %zmm14, %zmm9 + vpminsd %zmm2, %zmm16, %zmm8 + vpminsd %zmm1, %zmm13, %zmm21 + vpminsd %zmm6, %zmm17, %zmm19 + vpshufd $27, %zmm8, %zmm8 + vpminsd %zmm0, %zmm5, %zmm22 + vpmaxsd %zmm0, %zmm5, %zmm4 + vpshufd $27, %zmm9, %zmm9 + vpmaxsd %zmm1, %zmm13, %zmm13 + vpmaxsd %zmm6, %zmm17, %zmm11 + vpshufd $27, %zmm4, %zmm4 + vpminsd %zmm10, %zmm15, %zmm5 + vpminsd %zmm7, %zmm12, %zmm6 + vpshufd $27, %zmm11, %zmm11 + vpmaxsd %zmm2, %zmm16, %zmm0 + vpmaxsd %zmm10, %zmm15, %zmm3 + vpshufd $27, %zmm13, %zmm2 + vpmaxsd %zmm7, %zmm12, %zmm1 + vpshufd $27, %zmm5, %zmm5 + vpshufd $27, %zmm23, %zmm12 + vpminsd %zmm8, %zmm20, %zmm7 + vpshufd $27, %zmm6, %zmm6 + vpminsd %zmm4, %zmm1, %zmm14 + vpmaxsd %zmm8, %zmm20, %zmm16 + vpminsd %zmm2, %zmm3, %zmm17 + vpshufd $27, %zmm7, %zmm7 + vpminsd %zmm12, %zmm0, %zmm10 + vpminsd %zmm6, %zmm22, %zmm13 + vpminsd %zmm9, %zmm19, %zmm8 + vpmaxsd %zmm4, %zmm1, %zmm1 + vpmaxsd %zmm2, %zmm3, %zmm3 + vpminsd %zmm5, %zmm21, %zmm4 + vpshufd $27, %zmm8, %zmm8 + vpmaxsd %zmm5, %zmm21, %zmm5 + vpmaxsd %zmm6, %zmm22, %zmm6 + vpshufd $27, %zmm3, %zmm3 + vpminsd %zmm7, %zmm4, %zmm20 + vpmaxsd %zmm12, %zmm0, %zmm0 + vpshufd $27, %zmm5, %zmm5 + vpmaxsd %zmm9, %zmm19, %zmm9 + vpminsd %zmm11, %zmm18, %zmm2 + vpshufd $27, %zmm6, %zmm6 + vpmaxsd %zmm11, %zmm18, %zmm11 + vpshufd $27, %zmm1, %zmm1 + vpshufd $27, %zmm10, %zmm18 + vpminsd %zmm8, %zmm13, %zmm12 + vpminsd %zmm6, %zmm9, %zmm15 + vpshufd $27, %zmm2, %zmm2 + vpminsd %zmm5, %zmm16, %zmm10 + vpmaxsd %zmm8, %zmm13, %zmm13 + vpmaxsd %zmm7, %zmm4, %zmm4 + vpminsd %zmm18, %zmm17, %zmm19 + vpminsd %zmm1, %zmm11, %zmm7 + vpmaxsd %zmm18, %zmm17, %zmm17 + vpshufd $27, %zmm19, %zmm19 + vpminsd %zmm3, %zmm0, %zmm18 + vpmaxsd %zmm6, %zmm9, %zmm9 + vpmaxsd %zmm1, %zmm11, %zmm1 + vpmaxsd %zmm5, %zmm16, %zmm6 + vpshufd $27, %zmm20, %zmm5 + vpminsd %zmm2, %zmm14, %zmm8 + vpshufd $27, %zmm13, %zmm11 + vpmaxsd %zmm3, %zmm0, %zmm0 + vpminsd %zmm5, %zmm12, %zmm13 + 
vpshufd $27, %zmm10, %zmm3 + vpmaxsd %zmm5, %zmm12, %zmm5 + vpshufd $27, %zmm9, %zmm9 + vpshufd $27, %zmm18, %zmm18 + vpshufd $27, %zmm1, %zmm1 + vpmaxsd %zmm2, %zmm14, %zmm2 + vpminsd %zmm19, %zmm8, %zmm16 + vpminsd %zmm3, %zmm15, %zmm12 + vpminsd %zmm9, %zmm6, %zmm10 + vpshufd $27, %zmm2, %zmm2 + vpmaxsd %zmm3, %zmm15, %zmm15 + vpminsd %zmm18, %zmm7, %zmm21 + vpmaxsd %zmm9, %zmm6, %zmm3 + vpmaxsd %zmm19, %zmm8, %zmm8 + vpmaxsd %zmm18, %zmm7, %zmm9 + vpminsd %zmm1, %zmm0, %zmm19 + vpshufd $27, %zmm5, %zmm7 + vpmaxsd %zmm1, %zmm0, %zmm0 + vpshufd $27, %zmm13, %zmm1 + vpminsd %zmm11, %zmm4, %zmm14 + vpmaxsd %zmm2, %zmm17, %zmm6 + vpmaxsd %zmm11, %zmm4, %zmm4 + vpminsd %zmm2, %zmm17, %zmm11 + vpminsd %zmm1, %zmm13, %zmm2 + vpmaxsd %zmm1, %zmm13, %zmm13 + vpminsd %zmm7, %zmm5, %zmm1 + vpmaxsd %zmm7, %zmm5, %zmm5 + vmovdqa64 %zmm2, %zmm13{%k2} + vpblendmq %zmm1, %zmm5, %zmm7{%k2} + vpshufd $27, %zmm14, %zmm1 + vpminsd %zmm1, %zmm14, %zmm2 + vpmaxsd %zmm1, %zmm14, %zmm14 + vmovdqa64 %zmm2, %zmm14{%k2} + vpshufd $27, %zmm4, %zmm2 + vpminsd %zmm2, %zmm4, %zmm1 + vpmaxsd %zmm2, %zmm4, %zmm2 + vmovdqa64 %zmm1, %zmm2{%k2} + vpshufd $27, %zmm12, %zmm1 + vpminsd %zmm1, %zmm12, %zmm4 + vpmaxsd %zmm1, %zmm12, %zmm12 + vmovdqa64 %zmm4, %zmm12{%k2} + vpshufd $27, %zmm15, %zmm4 + vpminsd %zmm4, %zmm15, %zmm1 + vpmaxsd %zmm4, %zmm15, %zmm4 + vmovdqa64 %zmm1, %zmm4{%k2} + vpshufd $27, %zmm10, %zmm1 + vpminsd %zmm1, %zmm10, %zmm5 + vpmaxsd %zmm1, %zmm10, %zmm10 + vpshufd $27, %zmm3, %zmm1 + vmovdqa64 %zmm5, %zmm10{%k2} + vpminsd %zmm1, %zmm3, %zmm5 + vpmaxsd %zmm1, %zmm3, %zmm1 + vpshufd $27, %zmm16, %zmm3 + vmovdqa64 %zmm5, %zmm1{%k2} + vpminsd %zmm3, %zmm16, %zmm5 + vpmaxsd %zmm3, %zmm16, %zmm3 + vmovdqa64 %zmm5, %zmm3{%k2} + vpshufd $27, %zmm8, %zmm5 + vpminsd %zmm5, %zmm8, %zmm15 + vpmaxsd %zmm5, %zmm8, %zmm8 + vpshufd $27, %zmm11, %zmm5 + vmovdqa64 %zmm15, %zmm8{%k2} + vpminsd %zmm5, %zmm11, %zmm15 + vpmaxsd %zmm5, %zmm11, %zmm11 + vpshufd $27, %zmm6, %zmm5 + vmovdqa64 %zmm15, %zmm11{%k2} + vpminsd %zmm5, %zmm6, %zmm15 + vpmaxsd %zmm5, %zmm6, %zmm6 + vpshufd $27, %zmm21, %zmm5 + vmovdqa64 %zmm15, %zmm6{%k2} + vpminsd %zmm5, %zmm21, %zmm15 + vpmaxsd %zmm5, %zmm21, %zmm21 + vpshufd $27, %zmm9, %zmm5 + vmovdqa64 %zmm15, %zmm21{%k2} + vpminsd %zmm5, %zmm9, %zmm15 + vpmaxsd %zmm5, %zmm9, %zmm9 + vpshufd $27, %zmm19, %zmm5 + vmovdqa64 %zmm15, %zmm9{%k2} + vpminsd %zmm5, %zmm19, %zmm15 + vpmaxsd %zmm5, %zmm19, %zmm19 + vpshufd $27, %zmm0, %zmm5 + vmovdqa64 %zmm15, %zmm19{%k2} + vpminsd %zmm5, %zmm0, %zmm20 + vpmaxsd %zmm5, %zmm0, %zmm0 + vpblendmq %zmm20, %zmm0, %zmm20{%k2} + vpshufd $177, %zmm13, %zmm0 + vpmaxsd %zmm13, %zmm0, %zmm5 + vpminsd %zmm13, %zmm0, %zmm5{%k1} + vpshufd $177, %zmm7, %zmm0 + vpmaxsd %zmm7, %zmm0, %zmm13 + vpminsd %zmm7, %zmm0, %zmm13{%k1} + vpshufd $177, %zmm14, %zmm0 + vpmaxsd %zmm14, %zmm0, %zmm18 + vpminsd %zmm14, %zmm0, %zmm18{%k1} + vpshufd $177, %zmm2, %zmm0 + vpmaxsd %zmm2, %zmm0, %zmm17 + vpminsd %zmm2, %zmm0, %zmm17{%k1} + vpshufd $177, %zmm12, %zmm0 + vpmaxsd %zmm12, %zmm0, %zmm14 + vpminsd %zmm12, %zmm0, %zmm14{%k1} + vpshufd $177, %zmm4, %zmm0 + vpmaxsd %zmm4, %zmm0, %zmm16 + vpminsd %zmm4, %zmm0, %zmm16{%k1} + vpshufd $177, %zmm10, %zmm0 + vpshufd $177, %zmm20, %zmm4 + vpmaxsd %zmm10, %zmm0, %zmm15 + vpminsd %zmm10, %zmm0, %zmm15{%k1} + vpshufd $177, %zmm1, %zmm0 + vpmaxsd %zmm1, %zmm0, %zmm12 + vpminsd %zmm1, %zmm0, %zmm12{%k1} + vpshufd $177, %zmm3, %zmm0 + vpshufd $177, %zmm11, %zmm1 + vpmaxsd %zmm3, %zmm0, %zmm7 + vpmaxsd %zmm11, %zmm1, %zmm2 + vpminsd %zmm3, %zmm0, 
%zmm7{%k1} + vpshufd $177, %zmm8, %zmm0 + vpminsd %zmm11, %zmm1, %zmm2{%k1} + vpmaxsd %zmm8, %zmm0, %zmm10 + vpshufd $177, %zmm9, %zmm1 + vpminsd %zmm8, %zmm0, %zmm10{%k1} + vpshufd $177, %zmm6, %zmm0 + vpmaxsd %zmm9, %zmm1, %zmm3 + vpmaxsd %zmm6, %zmm0, %zmm8 + vpminsd %zmm9, %zmm1, %zmm3{%k1} + vpminsd %zmm6, %zmm0, %zmm8{%k1} + vpshufd $177, %zmm21, %zmm0 + vpmaxsd %zmm21, %zmm0, %zmm6 + vpminsd %zmm21, %zmm0, %zmm6{%k1} + vpshufd $177, %zmm19, %zmm0 + vpmaxsd %zmm19, %zmm0, %zmm1 + vpminsd %zmm19, %zmm0, %zmm1{%k1} + vpmaxsd %zmm20, %zmm4, %zmm0 + vpminsd %zmm20, %zmm4, %zmm0{%k1} + cmpq $7, %rsi + jbe .L211 + vmovdqa32 .LC1(%rip), %zmm9 + movl $51, %eax + kmovb %eax, %k3 + vpermd %zmm2, %zmm9, %zmm2 + vpermd %zmm1, %zmm9, %zmm1 + vpermd %zmm7, %zmm9, %zmm7 + vpminsd %zmm1, %zmm13, %zmm21 + vpermd %zmm10, %zmm9, %zmm10 + vpermd %zmm8, %zmm9, %zmm8 + vpermd %zmm6, %zmm9, %zmm6 + vpermd %zmm3, %zmm9, %zmm3 + vpermd %zmm0, %zmm9, %zmm0 + vpminsd %zmm2, %zmm16, %zmm11 + vpmaxsd %zmm1, %zmm13, %zmm1 + vpminsd %zmm3, %zmm18, %zmm20 + vpminsd %zmm6, %zmm17, %zmm19 + vpermd %zmm11, %zmm9, %zmm11 + vpminsd %zmm0, %zmm5, %zmm22 + vpminsd %zmm7, %zmm12, %zmm4 + vpermd %zmm1, %zmm9, %zmm1 + vpmaxsd %zmm6, %zmm17, %zmm6 + vpmaxsd %zmm0, %zmm5, %zmm0 + vpminsd %zmm8, %zmm14, %zmm17 + vpminsd %zmm10, %zmm15, %zmm5 + vpermd %zmm0, %zmm9, %zmm0 + vpmaxsd %zmm10, %zmm15, %zmm10 + vpmaxsd %zmm3, %zmm18, %zmm3 + vpermd %zmm17, %zmm9, %zmm17 + vpminsd %zmm1, %zmm10, %zmm13 + vpmaxsd %zmm8, %zmm14, %zmm8 + vpermd %zmm3, %zmm9, %zmm3 + vpmaxsd %zmm2, %zmm16, %zmm2 + vpmaxsd %zmm7, %zmm12, %zmm7 + vpermd %zmm5, %zmm9, %zmm5 + vpermd %zmm6, %zmm9, %zmm12 + vpermd %zmm4, %zmm9, %zmm6 + vpmaxsd %zmm1, %zmm10, %zmm4 + vpminsd %zmm11, %zmm20, %zmm10 + vpminsd %zmm5, %zmm21, %zmm18 + vpermd %zmm4, %zmm9, %zmm4 + vpminsd %zmm0, %zmm7, %zmm16 + vpmaxsd %zmm11, %zmm20, %zmm1 + vpminsd %zmm6, %zmm22, %zmm14 + vpminsd %zmm3, %zmm2, %zmm15 + vpminsd %zmm17, %zmm19, %zmm11 + vpmaxsd %zmm0, %zmm7, %zmm7 + vpermd %zmm15, %zmm9, %zmm15 + vpmaxsd %zmm3, %zmm2, %zmm0 + vpmaxsd %zmm5, %zmm21, %zmm5 + vpermd %zmm11, %zmm9, %zmm11 + vpmaxsd %zmm17, %zmm19, %zmm3 + vpmaxsd %zmm12, %zmm8, %zmm2 + vpermd %zmm5, %zmm9, %zmm5 + vpminsd %zmm12, %zmm8, %zmm17 + vpmaxsd %zmm6, %zmm22, %zmm6 + vpermd %zmm10, %zmm9, %zmm8 + vpermd %zmm6, %zmm9, %zmm6 + vpermd %zmm17, %zmm9, %zmm17 + vpermd %zmm7, %zmm9, %zmm7 + vpminsd %zmm8, %zmm18, %zmm19 + vpminsd %zmm11, %zmm14, %zmm12 + vpmaxsd %zmm8, %zmm18, %zmm10 + vpmaxsd %zmm11, %zmm14, %zmm14 + vpminsd %zmm6, %zmm3, %zmm8 + vpmaxsd %zmm15, %zmm13, %zmm11 + vpminsd %zmm5, %zmm1, %zmm18 + vpmaxsd %zmm6, %zmm3, %zmm3 + vpmaxsd %zmm5, %zmm1, %zmm1 + vpminsd %zmm17, %zmm16, %zmm6 + vpermd %zmm19, %zmm9, %zmm5 + vpmaxsd %zmm17, %zmm16, %zmm16 + vpminsd %zmm15, %zmm13, %zmm17 + vpermd %zmm18, %zmm9, %zmm18 + vpminsd %zmm7, %zmm2, %zmm13 + vpmaxsd %zmm7, %zmm2, %zmm2 + vpermd %zmm17, %zmm9, %zmm17 + vpermd %zmm2, %zmm9, %zmm2 + vpminsd %zmm4, %zmm0, %zmm15 + vpmaxsd %zmm4, %zmm0, %zmm0 + vpermd %zmm14, %zmm9, %zmm4 + vpminsd %zmm5, %zmm12, %zmm14 + vpminsd %zmm17, %zmm6, %zmm20 + vpermd %zmm3, %zmm9, %zmm3 + vpermd %zmm15, %zmm9, %zmm15 + vpmaxsd %zmm5, %zmm12, %zmm5 + vpmaxsd %zmm17, %zmm6, %zmm6 + vpminsd %zmm2, %zmm0, %zmm17 + vpermd %zmm16, %zmm9, %zmm16 + vpmaxsd %zmm2, %zmm0, %zmm0 + vpermd %zmm14, %zmm9, %zmm2 + vpminsd %zmm4, %zmm10, %zmm12 + vpminsd %zmm18, %zmm8, %zmm7 + vpmaxsd %zmm4, %zmm10, %zmm4 + vpmaxsd %zmm18, %zmm8, %zmm8 + vpminsd %zmm3, %zmm1, %zmm10 + vpminsd %zmm15, %zmm13, %zmm18 
+ vpmaxsd %zmm3, %zmm1, %zmm1 + vpmaxsd %zmm15, %zmm13, %zmm3 + vpminsd %zmm2, %zmm14, %zmm13 + vpmaxsd %zmm2, %zmm14, %zmm14 + vpermd %zmm5, %zmm9, %zmm2 + vpminsd %zmm16, %zmm11, %zmm19 + vmovdqa64 %zmm13, %zmm14{%k3} + vpminsd %zmm2, %zmm5, %zmm13 + vpmaxsd %zmm2, %zmm5, %zmm5 + vpermd %zmm12, %zmm9, %zmm2 + vmovdqa64 %zmm13, %zmm5{%k3} + vpmaxsd %zmm16, %zmm11, %zmm11 + vpminsd %zmm2, %zmm12, %zmm13 + vpmaxsd %zmm2, %zmm12, %zmm12 + vpermd %zmm4, %zmm9, %zmm2 + vmovdqa64 %zmm13, %zmm12{%k3} + vpminsd %zmm2, %zmm4, %zmm13 + vpmaxsd %zmm2, %zmm4, %zmm4 + vpermd %zmm7, %zmm9, %zmm2 + vmovdqa64 %zmm13, %zmm4{%k3} + vpshufd $78, %zmm5, %zmm16 + vpminsd %zmm2, %zmm7, %zmm13 + vpmaxsd %zmm2, %zmm7, %zmm7 + vpermd %zmm8, %zmm9, %zmm2 + vmovdqa64 %zmm13, %zmm7{%k3} + vpminsd %zmm2, %zmm8, %zmm13 + vpmaxsd %zmm2, %zmm8, %zmm8 + vpermd %zmm10, %zmm9, %zmm2 + vmovdqa64 %zmm13, %zmm8{%k3} + vpshufd $78, %zmm12, %zmm15 + vpminsd %zmm2, %zmm10, %zmm13 + vpmaxsd %zmm2, %zmm10, %zmm10 + vpermd %zmm1, %zmm9, %zmm2 + vmovdqa64 %zmm13, %zmm10{%k3} + vpminsd %zmm2, %zmm1, %zmm13 + vpmaxsd %zmm2, %zmm1, %zmm1 + vpermd %zmm20, %zmm9, %zmm2 + vmovdqa64 %zmm13, %zmm1{%k3} + vpminsd %zmm2, %zmm20, %zmm13 + vpmaxsd %zmm2, %zmm20, %zmm20 + vpermd %zmm6, %zmm9, %zmm2 + vmovdqa64 %zmm13, %zmm20{%k3} + vpminsd %zmm2, %zmm6, %zmm13 + vpmaxsd %zmm2, %zmm6, %zmm6 + vpermd %zmm19, %zmm9, %zmm2 + vmovdqa64 %zmm13, %zmm6{%k3} + vpminsd %zmm2, %zmm19, %zmm13 + vpmaxsd %zmm2, %zmm19, %zmm19 + vpermd %zmm11, %zmm9, %zmm2 + vmovdqa64 %zmm13, %zmm19{%k3} + vpminsd %zmm2, %zmm11, %zmm13 + vpmaxsd %zmm2, %zmm11, %zmm2 + vpermd %zmm18, %zmm9, %zmm11 + vmovdqa64 %zmm13, %zmm2{%k3} + vpminsd %zmm11, %zmm18, %zmm13 + vpmaxsd %zmm11, %zmm18, %zmm18 + vpermd %zmm3, %zmm9, %zmm11 + vmovdqa64 %zmm13, %zmm18{%k3} + vpminsd %zmm11, %zmm3, %zmm13 + vpmaxsd %zmm11, %zmm3, %zmm3 + vpermd %zmm17, %zmm9, %zmm11 + vmovdqa64 %zmm13, %zmm3{%k3} + vpermd %zmm0, %zmm9, %zmm9 + vpminsd %zmm11, %zmm17, %zmm13 + vpmaxsd %zmm11, %zmm17, %zmm17 + vpshufd $78, %zmm2, %zmm21 + vmovdqa64 %zmm13, %zmm17{%k3} + vpshufd $78, %zmm14, %zmm13 + vpminsd %zmm9, %zmm0, %zmm11 + vpmaxsd %zmm9, %zmm0, %zmm0 + vpminsd %zmm14, %zmm13, %zmm9 + vpmaxsd %zmm14, %zmm13, %zmm13 + vpshufd $78, %zmm4, %zmm14 + vmovdqa64 %zmm11, %zmm0{%k3} + vmovdqa64 %zmm9, %zmm13{%k2} + vpminsd %zmm5, %zmm16, %zmm9 + vpmaxsd %zmm5, %zmm16, %zmm16 + vpminsd %zmm12, %zmm15, %zmm5 + vpmaxsd %zmm12, %zmm15, %zmm15 + vpshufd $78, %zmm7, %zmm12 + vmovdqa64 %zmm5, %zmm15{%k2} + vpshufd $78, %zmm8, %zmm11 + vpminsd %zmm4, %zmm14, %zmm5 + vpmaxsd %zmm4, %zmm14, %zmm14 + vpminsd %zmm7, %zmm12, %zmm4 + vmovdqa64 %zmm9, %zmm16{%k2} + vpmaxsd %zmm7, %zmm12, %zmm12 + vpshufd $78, %zmm10, %zmm7 + vmovdqa64 %zmm5, %zmm14{%k2} + vmovdqa64 %zmm4, %zmm12{%k2} + vpminsd %zmm8, %zmm11, %zmm4 + vpmaxsd %zmm8, %zmm11, %zmm11 + vmovdqa64 %zmm4, %zmm11{%k2} + vpminsd %zmm10, %zmm7, %zmm4 + vpmaxsd %zmm10, %zmm7, %zmm7 + vmovdqa64 %zmm4, %zmm7{%k2} + vpshufd $78, %zmm20, %zmm10 + vpshufd $78, %zmm1, %zmm4 + vpminsd %zmm1, %zmm4, %zmm5 + vpshufd $78, %zmm6, %zmm8 + vpmaxsd %zmm1, %zmm4, %zmm1 + vpminsd %zmm20, %zmm10, %zmm4 + vpmaxsd %zmm20, %zmm10, %zmm10 + vpshufd $78, %zmm18, %zmm20 + vmovdqa64 %zmm4, %zmm10{%k2} + vpminsd %zmm6, %zmm8, %zmm4 + vpmaxsd %zmm6, %zmm8, %zmm8 + vpshufd $78, %zmm19, %zmm6 + vmovdqa64 %zmm4, %zmm8{%k2} + vpshufd $78, %zmm0, %zmm9 + vpminsd %zmm19, %zmm6, %zmm4 + vpmaxsd %zmm19, %zmm6, %zmm6 + vpshufd $78, %zmm3, %zmm19 + vmovdqa64 %zmm4, %zmm6{%k2} + vpminsd %zmm2, %zmm21, %zmm4 + 
vpmaxsd %zmm2, %zmm21, %zmm21 + vpminsd %zmm18, %zmm20, %zmm2 + vpmaxsd %zmm18, %zmm20, %zmm20 + vmovdqa64 %zmm4, %zmm21{%k2} + vmovdqa64 %zmm2, %zmm20{%k2} + vpshufd $78, %zmm17, %zmm4 + vpminsd %zmm3, %zmm19, %zmm2 + vpmaxsd %zmm3, %zmm19, %zmm19 + vmovdqa64 %zmm5, %zmm1{%k2} + vmovdqa64 %zmm2, %zmm19{%k2} + vpminsd %zmm17, %zmm4, %zmm2 + vpmaxsd %zmm17, %zmm4, %zmm4 + vmovdqa64 %zmm2, %zmm4{%k2} + vpminsd %zmm0, %zmm9, %zmm2 + vpmaxsd %zmm0, %zmm9, %zmm9 + vpshufd $177, %zmm13, %zmm0 + vmovdqa64 %zmm2, %zmm9{%k2} + vpmaxsd %zmm13, %zmm0, %zmm5 + vpminsd %zmm13, %zmm0, %zmm5{%k1} + vpshufd $177, %zmm16, %zmm0 + vpmaxsd %zmm16, %zmm0, %zmm13 + vpminsd %zmm16, %zmm0, %zmm13{%k1} + vpshufd $177, %zmm15, %zmm0 + vpmaxsd %zmm15, %zmm0, %zmm18 + vpminsd %zmm15, %zmm0, %zmm18{%k1} + vpshufd $177, %zmm14, %zmm0 + vpmaxsd %zmm14, %zmm0, %zmm17 + vpminsd %zmm14, %zmm0, %zmm17{%k1} + vpshufd $177, %zmm12, %zmm0 + vpmaxsd %zmm12, %zmm0, %zmm14 + vpminsd %zmm12, %zmm0, %zmm14{%k1} + vpshufd $177, %zmm11, %zmm0 + vpmaxsd %zmm11, %zmm0, %zmm16 + vpminsd %zmm11, %zmm0, %zmm16{%k1} + vpshufd $177, %zmm7, %zmm0 + vpshufd $177, %zmm9, %zmm11 + vpmaxsd %zmm7, %zmm0, %zmm15 + vpminsd %zmm7, %zmm0, %zmm15{%k1} + vpshufd $177, %zmm1, %zmm0 + vpmaxsd %zmm1, %zmm0, %zmm12 + vpminsd %zmm1, %zmm0, %zmm12{%k1} + vpshufd $177, %zmm10, %zmm0 + vpshufd $177, %zmm19, %zmm1 + vpmaxsd %zmm10, %zmm0, %zmm7 + vpmaxsd %zmm19, %zmm1, %zmm3 + vpminsd %zmm10, %zmm0, %zmm7{%k1} + vpshufd $177, %zmm8, %zmm0 + vpminsd %zmm19, %zmm1, %zmm3{%k1} + vpmaxsd %zmm8, %zmm0, %zmm10 + vpminsd %zmm8, %zmm0, %zmm10{%k1} + vpshufd $177, %zmm6, %zmm0 + vpmaxsd %zmm6, %zmm0, %zmm2 + vpminsd %zmm6, %zmm0, %zmm2{%k1} + vpshufd $177, %zmm21, %zmm0 + vpmaxsd %zmm21, %zmm0, %zmm8 + vpminsd %zmm21, %zmm0, %zmm8{%k1} + vpshufd $177, %zmm20, %zmm0 + vpmaxsd %zmm20, %zmm0, %zmm6 + vpminsd %zmm20, %zmm0, %zmm6{%k1} + vpshufd $177, %zmm4, %zmm0 + vpmaxsd %zmm4, %zmm0, %zmm1 + vpminsd %zmm4, %zmm0, %zmm1{%k1} + vpmaxsd %zmm9, %zmm11, %zmm0 + vmovdqa32 %zmm0, %zmm4 + vpminsd %zmm9, %zmm11, %zmm4{%k1} + vmovdqa64 %zmm4, %zmm0 + cmpq $15, %rsi + jbe .L211 + vmovdqa32 .LC2(%rip), %zmm0 + movl $65535, %eax + kmovd %eax, %k1 + movl $51, %eax + vpermd %zmm6, %zmm0, %zmm11 + vpermd %zmm10, %zmm0, %zmm10 + vpermd %zmm4, %zmm0, %zmm6 + vpermd %zmm2, %zmm0, %zmm2 + vpermd %zmm1, %zmm0, %zmm1 + vpminsd %zmm6, %zmm5, %zmm20 + vpminsd %zmm1, %zmm13, %zmm19 + vpermd %zmm8, %zmm0, %zmm8 + vpermd %zmm7, %zmm0, %zmm7 + vpermd %zmm3, %zmm0, %zmm3 + vpmaxsd %zmm6, %zmm5, %zmm6 + vpmaxsd %zmm1, %zmm13, %zmm1 + vpminsd %zmm10, %zmm15, %zmm5 + vpminsd %zmm2, %zmm16, %zmm13 + vpermd %zmm6, %zmm0, %zmm6 + vpminsd %zmm3, %zmm18, %zmm9 + vpermd %zmm5, %zmm0, %zmm5 + vpminsd %zmm8, %zmm14, %zmm4 + vpmaxsd %zmm3, %zmm18, %zmm3 + vpminsd %zmm11, %zmm17, %zmm18 + vpermd %zmm1, %zmm0, %zmm1 + vpmaxsd %zmm11, %zmm17, %zmm11 + vpmaxsd %zmm8, %zmm14, %zmm17 + vpermd %zmm3, %zmm0, %zmm3 + vpminsd %zmm7, %zmm12, %zmm8 + vpmaxsd %zmm7, %zmm12, %zmm7 + vpermd %zmm13, %zmm0, %zmm12 + vpmaxsd %zmm10, %zmm15, %zmm10 + vpminsd %zmm6, %zmm7, %zmm13 + vpermd %zmm11, %zmm0, %zmm15 + vpminsd %zmm5, %zmm19, %zmm14 + vpmaxsd %zmm2, %zmm16, %zmm2 + vpermd %zmm4, %zmm0, %zmm4 + vpermd %zmm8, %zmm0, %zmm8 + vpmaxsd %zmm6, %zmm7, %zmm6 + vpmaxsd %zmm5, %zmm19, %zmm7 + vpminsd %zmm12, %zmm9, %zmm19 + vpminsd %zmm8, %zmm20, %zmm16 + vpermd %zmm7, %zmm0, %zmm7 + vpminsd %zmm1, %zmm10, %zmm11 + vpermd %zmm19, %zmm0, %zmm19 + vpmaxsd %zmm1, %zmm10, %zmm5 + vpmaxsd %zmm8, %zmm20, %zmm8 + vpmaxsd %zmm12, %zmm9, 
%zmm1 + vpermd %zmm5, %zmm0, %zmm5 + vpminsd %zmm3, %zmm2, %zmm12 + vpminsd %zmm4, %zmm18, %zmm9 + vpermd %zmm8, %zmm0, %zmm8 + vpmaxsd %zmm4, %zmm18, %zmm4 + vpminsd %zmm15, %zmm17, %zmm18 + vpermd %zmm9, %zmm0, %zmm9 + vpermd %zmm12, %zmm0, %zmm12 + vpermd %zmm18, %zmm0, %zmm18 + vpermd %zmm6, %zmm0, %zmm6 + vpmaxsd %zmm3, %zmm2, %zmm2 + vpmaxsd %zmm15, %zmm17, %zmm3 + vpminsd %zmm19, %zmm14, %zmm17 + vpminsd %zmm9, %zmm16, %zmm15 + vpmaxsd %zmm9, %zmm16, %zmm10 + vpmaxsd %zmm19, %zmm14, %zmm14 + vpminsd %zmm8, %zmm4, %zmm9 + vpminsd %zmm7, %zmm1, %zmm16 + vpminsd %zmm12, %zmm11, %zmm19 + vpmaxsd %zmm7, %zmm1, %zmm1 + vpermd %zmm16, %zmm0, %zmm16 + vpminsd %zmm18, %zmm13, %zmm7 + vpmaxsd %zmm12, %zmm11, %zmm11 + vpermd %zmm19, %zmm0, %zmm19 + vpminsd %zmm5, %zmm2, %zmm12 + vpmaxsd %zmm8, %zmm4, %zmm4 + vpmaxsd %zmm18, %zmm13, %zmm13 + vpminsd %zmm6, %zmm3, %zmm8 + vpermd %zmm4, %zmm0, %zmm4 + vpmaxsd %zmm6, %zmm3, %zmm3 + vpermd %zmm17, %zmm0, %zmm6 + vpermd %zmm13, %zmm0, %zmm13 + vpermd %zmm12, %zmm0, %zmm12 + vpermd %zmm3, %zmm0, %zmm3 + vpmaxsd %zmm5, %zmm2, %zmm2 + vpermd %zmm10, %zmm0, %zmm5 + vpminsd %zmm6, %zmm15, %zmm10 + vpminsd %zmm16, %zmm9, %zmm17 + vpminsd %zmm5, %zmm14, %zmm18 + vpmaxsd %zmm6, %zmm15, %zmm6 + vpmaxsd %zmm5, %zmm14, %zmm5 + vpmaxsd %zmm16, %zmm9, %zmm9 + vpminsd %zmm13, %zmm11, %zmm14 + vpminsd %zmm4, %zmm1, %zmm16 + vpmaxsd %zmm13, %zmm11, %zmm11 + vpmaxsd %zmm4, %zmm1, %zmm1 + vpminsd %zmm12, %zmm8, %zmm13 + vpmaxsd %zmm12, %zmm8, %zmm4 + vpminsd %zmm3, %zmm2, %zmm8 + vpmaxsd %zmm3, %zmm2, %zmm2 + vpermd %zmm10, %zmm0, %zmm3 + vpminsd %zmm3, %zmm10, %zmm12 + vpmaxsd %zmm3, %zmm10, %zmm10 + vpermd %zmm6, %zmm0, %zmm3 + vmovdqu16 %zmm12, %zmm10{%k1} + vpminsd %zmm3, %zmm6, %zmm12 + vpmaxsd %zmm3, %zmm6, %zmm6 + vpermd %zmm18, %zmm0, %zmm3 + vmovdqu16 %zmm12, %zmm6{%k1} + vpminsd %zmm19, %zmm7, %zmm15 + vpminsd %zmm3, %zmm18, %zmm12 + vpmaxsd %zmm3, %zmm18, %zmm18 + vpermd %zmm5, %zmm0, %zmm3 + vmovdqu16 %zmm12, %zmm18{%k1} + vpminsd %zmm3, %zmm5, %zmm12 + vpmaxsd %zmm3, %zmm5, %zmm5 + vpermd %zmm17, %zmm0, %zmm3 + vmovdqu16 %zmm12, %zmm5{%k1} + vpmaxsd %zmm19, %zmm7, %zmm7 + vpminsd %zmm3, %zmm17, %zmm12 + vpmaxsd %zmm3, %zmm17, %zmm17 + vpermd %zmm9, %zmm0, %zmm3 + vmovdqu16 %zmm12, %zmm17{%k1} + vpminsd %zmm3, %zmm9, %zmm12 + vpmaxsd %zmm3, %zmm9, %zmm9 + vpermd %zmm16, %zmm0, %zmm3 + vmovdqu16 %zmm12, %zmm9{%k1} + vshufi32x4 $177, %zmm5, %zmm5, %zmm22 + vpminsd %zmm3, %zmm16, %zmm12 + vpmaxsd %zmm3, %zmm16, %zmm16 + vpermd %zmm1, %zmm0, %zmm3 + vmovdqu16 %zmm12, %zmm16{%k1} + vpminsd %zmm3, %zmm1, %zmm12 + vpmaxsd %zmm3, %zmm1, %zmm1 + vpermd %zmm15, %zmm0, %zmm3 + vmovdqu16 %zmm12, %zmm1{%k1} + vshufi32x4 $177, %zmm17, %zmm17, %zmm21 + vpminsd %zmm3, %zmm15, %zmm12 + vpmaxsd %zmm3, %zmm15, %zmm15 + vpermd %zmm7, %zmm0, %zmm3 + vmovdqu16 %zmm12, %zmm15{%k1} + vpminsd %zmm3, %zmm7, %zmm12 + vpmaxsd %zmm3, %zmm7, %zmm7 + vpermd %zmm14, %zmm0, %zmm3 + vmovdqu16 %zmm12, %zmm7{%k1} + vshufi32x4 $177, %zmm9, %zmm9, %zmm20 + vpminsd %zmm3, %zmm14, %zmm12 + vpmaxsd %zmm3, %zmm14, %zmm14 + vpermd %zmm11, %zmm0, %zmm3 + vmovdqu16 %zmm12, %zmm14{%k1} + vpminsd %zmm3, %zmm11, %zmm12 + vpmaxsd %zmm3, %zmm11, %zmm3 + vpermd %zmm13, %zmm0, %zmm11 + vmovdqu16 %zmm12, %zmm3{%k1} + vshufi32x4 $177, %zmm16, %zmm16, %zmm19 + vpminsd %zmm11, %zmm13, %zmm12 + vpmaxsd %zmm11, %zmm13, %zmm13 + vpermd %zmm4, %zmm0, %zmm11 + vmovdqu16 %zmm12, %zmm13{%k1} + vpminsd %zmm11, %zmm4, %zmm12 + vpmaxsd %zmm11, %zmm4, %zmm4 + vpermd %zmm8, %zmm0, %zmm11 + vmovdqu16 %zmm12, %zmm4{%k1} + 
vpermd %zmm2, %zmm0, %zmm0 + vpminsd %zmm11, %zmm8, %zmm12 + vpmaxsd %zmm11, %zmm8, %zmm8 + vmovdqu16 %zmm12, %zmm8{%k1} + vpminsd %zmm0, %zmm2, %zmm11 + vshufi32x4 $177, %zmm10, %zmm10, %zmm12 + vpmaxsd %zmm0, %zmm2, %zmm0 + vpminsd %zmm10, %zmm12, %zmm2 + vmovdqu16 %zmm11, %zmm0{%k1} + vpmaxsd %zmm10, %zmm12, %zmm12 + kmovb %eax, %k1 + vshufi32x4 $177, %zmm6, %zmm6, %zmm11 + vmovdqa64 %zmm2, %zmm12{%k1} + vpminsd %zmm6, %zmm11, %zmm2 + vpmaxsd %zmm6, %zmm11, %zmm11 + movl $85, %eax + vshufi32x4 $177, %zmm18, %zmm18, %zmm10 + vmovdqa64 %zmm2, %zmm11{%k1} + vshufi32x4 $177, %zmm14, %zmm14, %zmm6 + vpminsd %zmm18, %zmm10, %zmm2 + vpmaxsd %zmm18, %zmm10, %zmm10 + vshufi32x4 $177, %zmm15, %zmm15, %zmm18 + vmovdqa64 %zmm2, %zmm10{%k1} + vpminsd %zmm5, %zmm22, %zmm2 + vpmaxsd %zmm5, %zmm22, %zmm22 + vmovdqa64 %zmm2, %zmm22{%k1} + vpminsd %zmm17, %zmm21, %zmm2 + vpmaxsd %zmm17, %zmm21, %zmm21 + vmovdqa64 %zmm2, %zmm21{%k1} + vpminsd %zmm9, %zmm20, %zmm2 + vpmaxsd %zmm9, %zmm20, %zmm20 + vmovdqa64 %zmm2, %zmm20{%k1} + vpminsd %zmm16, %zmm19, %zmm2 + vpmaxsd %zmm16, %zmm19, %zmm19 + vmovdqa64 %zmm2, %zmm19{%k1} + vshufi32x4 $177, %zmm1, %zmm1, %zmm2 + vshufi32x4 $177, %zmm7, %zmm7, %zmm17 + vpminsd %zmm1, %zmm2, %zmm5 + vpmaxsd %zmm1, %zmm2, %zmm2 + vshufi32x4 $177, %zmm3, %zmm3, %zmm16 + vpminsd %zmm15, %zmm18, %zmm1 + vpmaxsd %zmm15, %zmm18, %zmm18 + vshufi32x4 $177, %zmm13, %zmm13, %zmm15 + vmovdqa64 %zmm1, %zmm18{%k1} + vpminsd %zmm7, %zmm17, %zmm1 + vpmaxsd %zmm7, %zmm17, %zmm17 + vmovdqa64 %zmm1, %zmm17{%k1} + vpminsd %zmm14, %zmm6, %zmm1 + vpmaxsd %zmm14, %zmm6, %zmm6 + vmovdqa64 %zmm1, %zmm6{%k1} + vpminsd %zmm3, %zmm16, %zmm1 + vpmaxsd %zmm3, %zmm16, %zmm16 + vmovdqa64 %zmm1, %zmm16{%k1} + vshufi32x4 $177, %zmm4, %zmm4, %zmm14 + vpminsd %zmm13, %zmm15, %zmm1 + vpmaxsd %zmm13, %zmm15, %zmm15 + vshufi32x4 $177, %zmm8, %zmm8, %zmm9 + vmovdqa64 %zmm5, %zmm2{%k1} + vmovdqa64 %zmm1, %zmm15{%k1} + vpminsd %zmm4, %zmm14, %zmm1 + vpmaxsd %zmm4, %zmm14, %zmm14 + vmovdqa64 %zmm1, %zmm14{%k1} + vshufi32x4 $177, %zmm0, %zmm0, %zmm5 + vpminsd %zmm8, %zmm9, %zmm1 + vpshufd $78, %zmm12, %zmm13 + vpmaxsd %zmm8, %zmm9, %zmm9 + vpshufd $78, %zmm21, %zmm7 + vmovdqa64 %zmm1, %zmm9{%k1} + vpminsd %zmm0, %zmm5, %zmm1 + vpmaxsd %zmm0, %zmm5, %zmm5 + vpminsd %zmm12, %zmm13, %zmm0 + vpmaxsd %zmm12, %zmm13, %zmm13 + vpshufd $78, %zmm11, %zmm12 + vmovdqa64 %zmm1, %zmm5{%k1} + kmovb %eax, %k1 + vmovdqa64 %zmm0, %zmm13{%k1} + vpminsd %zmm11, %zmm12, %zmm0 + vpmaxsd %zmm11, %zmm12, %zmm12 + vpshufd $78, %zmm10, %zmm11 + vpshufd $78, %zmm20, %zmm4 + movl $21845, %eax + vmovdqa64 %zmm0, %zmm12{%k1} + vpminsd %zmm10, %zmm11, %zmm0 + vpmaxsd %zmm10, %zmm11, %zmm11 + vpshufd $78, %zmm22, %zmm10 + vmovdqa64 %zmm0, %zmm11{%k1} + vpshufd $78, %zmm19, %zmm3 + vpminsd %zmm22, %zmm10, %zmm0 + vpmaxsd %zmm22, %zmm10, %zmm10 + vpshufd $78, %zmm2, %zmm1 + vmovdqa64 %zmm0, %zmm10{%k1} + vpminsd %zmm21, %zmm7, %zmm0 + vpmaxsd %zmm21, %zmm7, %zmm7 + vmovdqa64 %zmm0, %zmm7{%k1} + vpminsd %zmm20, %zmm4, %zmm0 + vpmaxsd %zmm20, %zmm4, %zmm4 + vmovdqa64 %zmm0, %zmm4{%k1} + vpminsd %zmm19, %zmm3, %zmm0 + vpmaxsd %zmm19, %zmm3, %zmm3 + vmovdqa64 %zmm0, %zmm3{%k1} + vpminsd %zmm2, %zmm1, %zmm0 + vpmaxsd %zmm2, %zmm1, %zmm1 + vpshufd $78, %zmm18, %zmm2 + vmovdqa64 %zmm0, %zmm1{%k1} + vpshufd $78, %zmm15, %zmm21 + vpminsd %zmm18, %zmm2, %zmm0 + vpmaxsd %zmm18, %zmm2, %zmm2 + vpshufd $78, %zmm14, %zmm20 + vmovdqa64 %zmm0, %zmm2{%k1} + vpshufd $78, %zmm17, %zmm0 + vpshufd $78, %zmm9, %zmm19 + vpminsd %zmm17, %zmm0, %zmm8 + vpmaxsd %zmm17, %zmm0, 
%zmm0 + vmovdqa64 %zmm8, %zmm0{%k1} + vpshufd $78, %zmm6, %zmm8 + vpminsd %zmm6, %zmm8, %zmm17 + vpmaxsd %zmm6, %zmm8, %zmm8 + vpshufd $78, %zmm16, %zmm6 + vmovdqa64 %zmm17, %zmm8{%k1} + vpminsd %zmm16, %zmm6, %zmm17 + vpmaxsd %zmm16, %zmm6, %zmm6 + vpminsd %zmm15, %zmm21, %zmm16 + vpmaxsd %zmm15, %zmm21, %zmm21 + vmovdqa64 %zmm17, %zmm6{%k1} + vpminsd %zmm14, %zmm20, %zmm15 + vpmaxsd %zmm14, %zmm20, %zmm20 + vmovdqa64 %zmm16, %zmm21{%k1} + vpminsd %zmm9, %zmm19, %zmm14 + vpmaxsd %zmm9, %zmm19, %zmm19 + vpshufd $78, %zmm5, %zmm9 + vmovdqa64 %zmm14, %zmm19{%k1} + vpminsd %zmm5, %zmm9, %zmm14 + vpmaxsd %zmm5, %zmm9, %zmm9 + vmovdqa64 %zmm14, %zmm9{%k1} + vpshufd $177, %zmm13, %zmm14 + vmovdqa64 %zmm15, %zmm20{%k1} + kmovw %eax, %k1 + vpmaxsd %zmm13, %zmm14, %zmm5 + vpminsd %zmm13, %zmm14, %zmm5{%k1} + vpshufd $177, %zmm12, %zmm14 + vpmaxsd %zmm12, %zmm14, %zmm13 + vpminsd %zmm12, %zmm14, %zmm13{%k1} + vpshufd $177, %zmm11, %zmm12 + vpmaxsd %zmm11, %zmm12, %zmm18 + vpminsd %zmm11, %zmm12, %zmm18{%k1} + vpshufd $177, %zmm10, %zmm11 + vpmaxsd %zmm10, %zmm11, %zmm17 + vpminsd %zmm10, %zmm11, %zmm17{%k1} + vpshufd $177, %zmm7, %zmm10 + vpmaxsd %zmm7, %zmm10, %zmm14 + vpminsd %zmm7, %zmm10, %zmm14{%k1} + vpshufd $177, %zmm4, %zmm7 + vpmaxsd %zmm4, %zmm7, %zmm16 + vpminsd %zmm4, %zmm7, %zmm16{%k1} + vpshufd $177, %zmm3, %zmm4 + vpmaxsd %zmm3, %zmm4, %zmm15 + vpminsd %zmm3, %zmm4, %zmm15{%k1} + vpshufd $177, %zmm1, %zmm3 + vpshufd $177, %zmm9, %zmm4 + vpmaxsd %zmm1, %zmm3, %zmm12 + vpminsd %zmm1, %zmm3, %zmm12{%k1} + vpshufd $177, %zmm0, %zmm1 + vpshufd $177, %zmm2, %zmm3 + vpmaxsd %zmm0, %zmm1, %zmm10 + vpmaxsd %zmm2, %zmm3, %zmm7 + vpminsd %zmm0, %zmm1, %zmm10{%k1} + vpshufd $177, %zmm8, %zmm1 + vpshufd $177, %zmm6, %zmm0 + vpminsd %zmm2, %zmm3, %zmm7{%k1} + vpmaxsd %zmm8, %zmm1, %zmm2 + vpminsd %zmm8, %zmm1, %zmm2{%k1} + vpmaxsd %zmm6, %zmm0, %zmm8 + vpshufd $177, %zmm20, %zmm1 + vpminsd %zmm6, %zmm0, %zmm8{%k1} + vpshufd $177, %zmm21, %zmm0 + vpmaxsd %zmm20, %zmm1, %zmm3 + vpmaxsd %zmm21, %zmm0, %zmm6 + vpminsd %zmm20, %zmm1, %zmm3{%k1} + vpminsd %zmm21, %zmm0, %zmm6{%k1} + vpshufd $177, %zmm19, %zmm0 + vpmaxsd %zmm19, %zmm0, %zmm1 + vpminsd %zmm19, %zmm0, %zmm1{%k1} + vpmaxsd %zmm9, %zmm4, %zmm0 + vpminsd %zmm9, %zmm4, %zmm0{%k1} +.L211: + vmovq %xmm26, %rax + vmovdqu64 %zmm5, (%rdi) + vmovdqu64 %zmm13, (%r15) + vmovdqu64 %zmm18, (%r14) + vmovdqu64 %zmm17, 0(%r13) + vmovdqu64 %zmm14, (%r12) + vmovdqu64 %zmm16, (%rbx) + vmovdqu64 %zmm15, (%r11) + vmovdqu64 %zmm12, (%r10) + vmovdqu64 %zmm7, (%r9) + vmovdqu64 %zmm10, (%r8) + vmovdqu64 %zmm2, (%rax) + vmovq %xmm25, %rax + vmovdqu64 %zmm8, (%rax) + vmovq %xmm27, %rax + vmovdqu64 %zmm6, (%rax) + vmovq %xmm24, %rax + vmovdqu64 %zmm3, (%rdx) + vmovdqu64 %zmm1, (%rax) + movq (%rsp), %rax + vmovdqu64 %zmm0, (%rax) + vzeroupper + leaq -40(%rbp), %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_def_cfa 7, 8 + ret + .cfi_endproc +.LFE18789: + .size _ZN3hwy6N_AVX36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0, .-_ZN3hwy6N_AVX36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + .section .text._ZN3hwy6N_AVX26detail22MaybePartitionTwoValueINS0_4SimdIiLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0,"ax",@progbits + .p2align 4 + .type 
_ZN3hwy6N_AVX26detail22MaybePartitionTwoValueINS0_4SimdIiLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, @function +_ZN3hwy6N_AVX26detail22MaybePartitionTwoValueINS0_4SimdIiLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0: +.LFB18790: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + vmovdqa %ymm0, %ymm3 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + .cfi_offset 15, -24 + movq %rdx, %r15 + pushq %r14 + .cfi_offset 14, -32 + movq %rsi, %r14 + pushq %r13 + pushq %r12 + .cfi_offset 13, -40 + .cfi_offset 12, -48 + movq %rdi, %r12 + pushq %rbx + andq $-32, %rsp + subq $96, %rsp + .cfi_offset 3, -56 + cmpq $7, %rsi + jbe .L231 + vmovdqa %ymm0, %ymm5 + vmovdqa %ymm1, %ymm6 + movl $8, %r13d + xorl %ebx, %ebx + jmp .L222 + .p2align 4,,10 + .p2align 3 +.L218: + vmovmskps %ymm2, %eax + vmovdqu %ymm3, (%rsi) + popcntq %rax, %rax + addq %rax, %rbx + leaq 8(%r13), %rax + cmpq %r14, %rax + ja .L239 + movq %rax, %r13 +.L222: + vpcmpeqd -32(%r12,%r13,4), %ymm6, %ymm0 + vpcmpeqd -32(%r12,%r13,4), %ymm5, %ymm2 + leaq -8(%r13), %rdx + leaq (%r12,%rbx,4), %rsi + vmovdqa %ymm0, %ymm4 + vpor %ymm2, %ymm0, %ymm0 + vmovmskps %ymm0, %eax + cmpl $255, %eax + je .L218 + vpcmpeqd %ymm0, %ymm0, %ymm0 + vpxor %ymm0, %ymm4, %ymm4 + vpandn %ymm4, %ymm2, %ymm2 + vmovmskps %ymm2, %eax + tzcntl %eax, %eax + addq %rdx, %rax + vpbroadcastd (%r12,%rax,4), %ymm0 + leaq 8(%rbx), %rax + vmovdqa %ymm0, (%r15) + cmpq %rdx, %rax + ja .L219 + .p2align 4,,10 + .p2align 3 +.L220: + vmovdqu %ymm1, -32(%r12,%rax,4) + movq %rax, %rbx + addq $8, %rax + cmpq %rax, %rdx + jnb .L220 +.L219: + subq %rbx, %rdx + xorl %eax, %eax + vmovd %edx, %xmm0 + vpbroadcastd %xmm0, %ymm0 + vpcmpgtd .LC3(%rip), %ymm0, %ymm0 + vpmaskmovd %ymm1, %ymm0, (%r12,%rbx,4) +.L216: + leaq -40(%rbp), %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L239: + .cfi_restore_state + movq %r14, %r8 + leaq 0(,%r13,4), %rsi + leaq (%r12,%rbx,4), %r9 + subq %r13, %r8 +.L217: + testq %r8, %r8 + je .L226 + leaq 0(,%r8,4), %rdx + addq %r12, %rsi + movq %rcx, %rdi + movq %r9, 80(%rsp) + movq %r8, 88(%rsp) + vmovdqa %ymm1, (%rsp) + vmovdqa %ymm3, 32(%rsp) + vzeroupper + call memcpy@PLT + movq 88(%rsp), %r8 + movq 80(%rsp), %r9 + vmovdqa 32(%rsp), %ymm3 + vmovdqa (%rsp), %ymm1 + movq %rax, %rcx +.L226: + vmovdqa (%rcx), %ymm4 + vmovdqa .LC3(%rip), %ymm5 + vmovd %r8d, %xmm2 + vpbroadcastd %xmm2, %ymm2 + vpcmpeqd %ymm3, %ymm4, %ymm0 + vpcmpgtd %ymm5, %ymm2, %ymm2 + vpcmpeqd %ymm1, %ymm4, %ymm4 + vpand %ymm0, %ymm2, %ymm7 + vpor %ymm4, %ymm0, %ymm0 + vpcmpeqd %ymm4, %ymm4, %ymm4 + vpxor %ymm2, %ymm4, %ymm6 + vpor %ymm6, %ymm0, %ymm0 + vmovmskps %ymm0, %eax + cmpl $255, %eax + jne .L240 + vmovmskps %ymm7, %edx + vpmaskmovd %ymm3, %ymm2, (%r9) + popcntq %rdx, %rdx + addq %rbx, %rdx + leaq 8(%rdx), %rax + cmpq %r14, %rax + ja .L229 + .p2align 4,,10 + .p2align 3 +.L230: + vmovdqu %ymm1, -32(%r12,%rax,4) + movq %rax, %rdx + addq $8, %rax + cmpq %rax, %r14 + jnb .L230 +.L229: + subq %rdx, %r14 + movl $1, %eax + vmovd %r14d, %xmm0 + vpbroadcastd %xmm0, %ymm0 + vpcmpgtd %ymm5, %ymm0, %ymm0 + vpmaskmovd %ymm1, %ymm0, (%r12,%rdx,4) + leaq -40(%rbp), %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret +.L231: + .cfi_restore_state + 
movq %rsi, %r8 + movq %rdi, %r9 + xorl %esi, %esi + xorl %ebx, %ebx + xorl %r13d, %r13d + jmp .L217 +.L240: + vpxor %ymm4, %ymm0, %ymm0 + vmovmskps %ymm0, %eax + tzcntl %eax, %eax + addq %r13, %rax + vpbroadcastd (%r12,%rax,4), %ymm0 + leaq 8(%rbx), %rax + vmovdqa %ymm0, (%r15) + cmpq %rax, %r13 + jb .L227 + .p2align 4,,10 + .p2align 3 +.L228: + vmovdqu %ymm1, -32(%r12,%rax,4) + movq %rax, %rbx + leaq 8(%rax), %rax + cmpq %rax, %r13 + jnb .L228 + leaq (%r12,%rbx,4), %r9 +.L227: + subq %rbx, %r13 + xorl %eax, %eax + vmovd %r13d, %xmm0 + vpbroadcastd %xmm0, %ymm0 + vpcmpgtd %ymm5, %ymm0, %ymm0 + vpmaskmovd %ymm1, %ymm0, (%r9) + jmp .L216 + .cfi_endproc +.LFE18790: + .size _ZN3hwy6N_AVX26detail22MaybePartitionTwoValueINS0_4SimdIiLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, .-_ZN3hwy6N_AVX26detail22MaybePartitionTwoValueINS0_4SimdIiLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + .section .text._ZN3hwy6N_AVX26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_AVX26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0, @function +_ZN3hwy6N_AVX26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0: +.LFB18791: + .cfi_startproc + cmpq %rdx, %rsi + jbe .L254 + leaq (%rdx,%rdx), %rcx + leaq 1(%rdx), %r10 + leaq 1(%rcx), %rax + addq $2, %rcx + cmpq %rax, %rsi + jbe .L254 + movl (%rdi,%rdx,4), %r11d + vmovd %r11d, %xmm4 + vpshufd $0, %xmm4, %xmm0 + jmp .L244 + .p2align 4,,10 + .p2align 3 +.L245: + cmpq %rcx, %rsi + jbe .L254 + movq %rdx, %rax +.L250: + vbroadcastss (%rdi,%r10,8), %xmm1 + vpcmpgtd %xmm3, %xmm1, %xmm1 + vmovmskps %xmm1, %r8d + andl $1, %r8d + jne .L247 +.L246: + cmpq %rdx, %rax + je .L254 + leaq (%rdi,%rax,4), %rdx + movl (%rdx), %ecx + movl %ecx, (%r9) + movl %r11d, (%rdx) + cmpq %rax, %rsi + jbe .L255 + movq %rax, %rdx +.L248: + leaq (%rdx,%rdx), %rcx + leaq 1(%rdx), %r10 + leaq 1(%rcx), %rax + addq $2, %rcx + cmpq %rsi, %rax + jnb .L254 +.L244: + vbroadcastss (%rdi,%rax,4), %xmm1 + leaq (%rdi,%rdx,4), %r9 + vmovdqa %xmm0, %xmm3 + vpcmpgtd %xmm0, %xmm1, %xmm2 + vmovmskps %xmm2, %r8d + andl $1, %r8d + je .L245 + cmpq %rcx, %rsi + jbe .L246 + vmovdqa %xmm1, %xmm3 + jmp .L250 + .p2align 4,,10 + .p2align 3 +.L247: + cmpq %rcx, %rdx + je .L256 + leaq (%rdi,%rcx,4), %rax + movl (%rax), %edx + movl %edx, (%r9) + movq %rcx, %rdx + movl %r11d, (%rax) + jmp .L248 + .p2align 4,,10 + .p2align 3 +.L254: + ret + .p2align 4,,10 + .p2align 3 +.L255: + ret +.L256: + ret + .cfi_endproc +.LFE18791: + .size _ZN3hwy6N_AVX26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0, .-_ZN3hwy6N_AVX26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0 + .section .text._ZN3hwy6N_SSE46detail22MaybePartitionTwoValueINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_SSE46detail22MaybePartitionTwoValueINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, @function 
+_ZN3hwy6N_SSE46detail22MaybePartitionTwoValueINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0: +.LFB18792: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movdqa %xmm0, %xmm3 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + .cfi_offset 15, -24 + movq %rdx, %r15 + pushq %r14 + .cfi_offset 14, -32 + movq %rsi, %r14 + pushq %r13 + pushq %r12 + .cfi_offset 13, -40 + .cfi_offset 12, -48 + movq %rdi, %r12 + pushq %rbx + subq $56, %rsp + .cfi_offset 3, -56 + cmpq $3, %rsi + jbe .L288 + movdqa %xmm0, %xmm6 + movdqa %xmm1, %xmm5 + movl $4, %r13d + xorl %ebx, %ebx + jmp .L267 + .p2align 4,,10 + .p2align 3 +.L259: + movmskps %xmm2, %eax + movups %xmm3, (%r12,%rbx,4) + popcntq %rax, %rax + addq %rax, %rbx + leaq 4(%r13), %rax + cmpq %r14, %rax + ja .L344 + movq %rax, %r13 +.L267: + movdqu -16(%r12,%r13,4), %xmm2 + movdqu -16(%r12,%r13,4), %xmm0 + leaq -4(%r13), %rdx + pcmpeqd %xmm5, %xmm0 + pcmpeqd %xmm6, %xmm2 + movdqa %xmm0, %xmm4 + por %xmm2, %xmm0 + movmskps %xmm0, %eax + cmpl $15, %eax + je .L259 + pcmpeqd %xmm0, %xmm0 + pxor %xmm0, %xmm4 + pandn %xmm4, %xmm2 + movmskps %xmm2, %eax + rep bsfl %eax, %eax + cltq + addq %rdx, %rax + movd (%r12,%rax,4), %xmm7 + leaq 4(%rbx), %rax + pshufd $0, %xmm7, %xmm0 + movaps %xmm0, (%r15) + cmpq %rdx, %rax + ja .L260 + .p2align 4,,10 + .p2align 3 +.L261: + movups %xmm1, -16(%r12,%rax,4) + movq %rax, %rbx + addq $4, %rax + cmpq %rdx, %rax + jbe .L261 +.L260: + subq %rbx, %rdx + leaq 0(,%rbx,4), %rcx + movd %edx, %xmm7 + pshufd $0, %xmm7, %xmm0 + pcmpgtd .LC0(%rip), %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L262 + movd %xmm1, (%r12,%rbx,4) +.L262: + pextrd $1, %xmm0, %eax + testl %eax, %eax + je .L263 + pextrd $1, %xmm1, 4(%r12,%rcx) +.L263: + pextrd $2, %xmm0, %eax + testl %eax, %eax + je .L264 + pextrd $2, %xmm1, 8(%r12,%rcx) +.L264: + pextrd $3, %xmm0, %eax + testl %eax, %eax + jne .L345 +.L277: + addq $56, %rsp + xorl %eax, %eax + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L345: + .cfi_restore_state + pextrd $3, %xmm1, 12(%r12,%rcx) + jmp .L277 + .p2align 4,,10 + .p2align 3 +.L344: + movq %r14, %r8 + leaq 0(,%r13,4), %rsi + leaq 0(,%rbx,4), %r9 + subq %r13, %r8 +.L258: + testq %r8, %r8 + je .L271 + leaq 0(,%r8,4), %rdx + movq %rcx, %rdi + addq %r12, %rsi + movq %r9, -64(%rbp) + movq %r8, -56(%rbp) + movaps %xmm1, -96(%rbp) + movaps %xmm3, -80(%rbp) + call memcpy@PLT + movq -56(%rbp), %r8 + movq -64(%rbp), %r9 + movdqa -80(%rbp), %xmm3 + movdqa -96(%rbp), %xmm1 + movq %rax, %rcx +.L271: + movdqa (%rcx), %xmm4 + movd %r8d, %xmm7 + movdqa .LC0(%rip), %xmm5 + pshufd $0, %xmm7, %xmm0 + pcmpgtd %xmm5, %xmm0 + movdqa %xmm4, %xmm2 + pcmpeqd %xmm3, %xmm2 + pcmpeqd %xmm1, %xmm4 + movdqa %xmm0, %xmm7 + pand %xmm2, %xmm7 + por %xmm4, %xmm2 + pcmpeqd %xmm4, %xmm4 + movdqa %xmm4, %xmm6 + pxor %xmm0, %xmm6 + por %xmm6, %xmm2 + movmskps %xmm2, %eax + cmpl $15, %eax + jne .L346 + movd %xmm0, %eax + testl %eax, %eax + je .L278 + movd %xmm3, (%r12,%r9) +.L278: + pextrd $1, %xmm0, %eax + testl %eax, %eax + je .L279 + pextrd $1, %xmm3, 4(%r12,%r9) +.L279: + pextrd $2, %xmm0, %eax + testl %eax, %eax + je .L280 + pextrd $2, %xmm3, 8(%r12,%r9) +.L280: + pextrd $3, %xmm0, %eax + testl %eax, %eax + jne .L347 +.L281: + movmskps %xmm7, %edx + popcntq %rdx, %rdx + addq %rbx, %rdx + leaq 4(%rdx), %rax + cmpq %rax, %r14 + jb .L282 + .p2align 4,,10 + 
.p2align 3 +.L283: + movups %xmm1, -16(%r12,%rax,4) + movq %rax, %rdx + addq $4, %rax + cmpq %rax, %r14 + jnb .L283 +.L282: + subq %rdx, %r14 + leaq 0(,%rdx,4), %rcx + movd %r14d, %xmm7 + pshufd $0, %xmm7, %xmm0 + pcmpgtd %xmm5, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L284 + movd %xmm1, (%r12,%rdx,4) +.L284: + pextrd $1, %xmm0, %eax + testl %eax, %eax + je .L285 + pextrd $1, %xmm1, 4(%r12,%rcx) +.L285: + pextrd $2, %xmm0, %eax + testl %eax, %eax + je .L286 + pextrd $2, %xmm1, 8(%r12,%rcx) +.L286: + pextrd $3, %xmm0, %eax + testl %eax, %eax + je .L287 + pextrd $3, %xmm1, 12(%r12,%rcx) +.L287: + addq $56, %rsp + movl $1, %eax + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L347: + .cfi_restore_state + pextrd $3, %xmm3, 12(%r12,%r9) + jmp .L281 +.L288: + movq %rsi, %r8 + xorl %r9d, %r9d + xorl %esi, %esi + xorl %ebx, %ebx + xorl %r13d, %r13d + jmp .L258 +.L346: + pxor %xmm4, %xmm2 + movmskps %xmm2, %eax + rep bsfl %eax, %eax + cltq + addq %r13, %rax + movd (%r12,%rax,4), %xmm7 + leaq 4(%rbx), %rax + pshufd $0, %xmm7, %xmm0 + movaps %xmm0, (%r15) + cmpq %rax, %r13 + jb .L272 + .p2align 4,,10 + .p2align 3 +.L273: + movups %xmm1, -16(%r12,%rax,4) + movq %rax, %rbx + leaq 4(%rax), %rax + cmpq %r13, %rax + jbe .L273 + leaq 0(,%rbx,4), %r9 +.L272: + subq %rbx, %r13 + movd %r13d, %xmm7 + pshufd $0, %xmm7, %xmm0 + pcmpgtd %xmm5, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L274 + movd %xmm1, (%r12,%r9) +.L274: + pextrd $1, %xmm0, %eax + testl %eax, %eax + je .L275 + pextrd $1, %xmm1, 4(%r12,%r9) +.L275: + pextrd $2, %xmm0, %eax + testl %eax, %eax + je .L276 + pextrd $2, %xmm1, 8(%r12,%r9) +.L276: + pextrd $3, %xmm0, %eax + testl %eax, %eax + je .L277 + pextrd $3, %xmm1, 12(%r12,%r9) + jmp .L277 + .cfi_endproc +.LFE18792: + .size _ZN3hwy6N_SSE46detail22MaybePartitionTwoValueINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, .-_ZN3hwy6N_SSE46detail22MaybePartitionTwoValueINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + .section .text._ZN3hwy6N_SSE46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_SSE46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0, @function +_ZN3hwy6N_SSE46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0: +.LFB18793: + .cfi_startproc + cmpq %rdx, %rsi + jbe .L348 + leaq (%rdx,%rdx), %rcx + leaq 1(%rdx), %r10 + leaq 1(%rcx), %rax + addq $2, %rcx + cmpq %rax, %rsi + jbe .L348 + movl (%rdi,%rdx,4), %r11d + movd %r11d, %xmm6 + pshufd $0, %xmm6, %xmm0 + jmp .L351 + .p2align 4,,10 + .p2align 3 +.L352: + cmpq %rcx, %rsi + jbe .L348 + movq %rdx, %rax +.L357: + movd (%rdi,%r10,8), %xmm5 + pshufd $0, %xmm5, %xmm1 + pcmpgtd %xmm3, %xmm1 + movmskps %xmm1, %r8d + andl $1, %r8d + jne .L354 +.L353: + cmpq %rdx, %rax + je .L348 + leaq (%rdi,%rax,4), %rdx + movl (%rdx), %ecx + movl %ecx, (%r9) + movl %r11d, (%rdx) + cmpq %rax, %rsi + jbe .L361 + movq %rax, %rdx +.L355: + leaq (%rdx,%rdx), %rcx + leaq 1(%rdx), %r10 + leaq 1(%rcx), %rax + addq $2, %rcx + cmpq %rsi, %rax + jnb .L348 +.L351: + movd (%rdi,%rax,4), %xmm4 + leaq (%rdi,%rdx,4), %r9 + movdqa %xmm0, %xmm3 + pshufd $0, %xmm4, %xmm1 + movdqa %xmm1, %xmm2 + 
pcmpgtd %xmm0, %xmm2 + movmskps %xmm2, %r8d + andl $1, %r8d + je .L352 + cmpq %rcx, %rsi + jbe .L353 + movdqa %xmm1, %xmm3 + jmp .L357 + .p2align 4,,10 + .p2align 3 +.L354: + cmpq %rcx, %rdx + je .L362 + leaq (%rdi,%rcx,4), %rax + movl (%rax), %edx + movl %edx, (%r9) + movq %rcx, %rdx + movl %r11d, (%rax) + jmp .L355 + .p2align 4,,10 + .p2align 3 +.L348: + ret + .p2align 4,,10 + .p2align 3 +.L361: + ret +.L362: + ret + .cfi_endproc +.LFE18793: + .size _ZN3hwy6N_SSE46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0, .-_ZN3hwy6N_SSE46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0 + .section .text._ZN3hwy6N_SSE46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_SSE46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0, @function +_ZN3hwy6N_SSE46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0: +.LFB18794: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsi, %rax + salq $2, %rax + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + .cfi_offset 15, -24 + leaq (%rdi,%rax), %r15 + pushq %r14 + .cfi_offset 14, -32 + leaq (%r15,%rax), %r14 + pushq %r13 + .cfi_offset 13, -40 + leaq (%r14,%rax), %r13 + pushq %r12 + .cfi_offset 12, -48 + leaq 0(%r13,%rax), %r12 + pushq %rbx + .cfi_offset 3, -56 + leaq (%r12,%rax), %rbx + leaq (%rbx,%rax), %r11 + leaq (%r11,%rax), %r10 + leaq (%r10,%rax), %r9 + subq $64, %rsp + leaq (%r9,%rax), %r8 + movq %rdi, -56(%rbp) + leaq (%r8,%rax), %rdi + movq %rsi, -128(%rbp) + leaq (%rdi,%rax), %rsi + movdqu (%r15), %xmm4 + movdqu (%r14), %xmm14 + leaq (%rsi,%rax), %rcx + movdqu (%r14), %xmm8 + movdqu (%r11), %xmm3 + movdqu (%r12), %xmm13 + movdqu (%r12), %xmm1 + movq %rcx, -64(%rbp) + addq %rax, %rcx + leaq (%rcx,%rax), %rdx + movdqu (%r11), %xmm7 + movdqu (%r9), %xmm12 + movq %rcx, -136(%rbp) + addq %rdx, %rax + movq %rdx, -112(%rbp) + movdqu (%r9), %xmm6 + movq %rdx, -144(%rbp) + movq -56(%rbp), %rdx + movdqu (%rdi), %xmm11 + movdqu (%rdx), %xmm15 + movdqu (%rdx), %xmm9 + pminsd %xmm4, %xmm15 + pmaxsd %xmm4, %xmm9 + movdqu 0(%r13), %xmm4 + pminsd %xmm4, %xmm14 + pmaxsd %xmm4, %xmm8 + movdqu (%rbx), %xmm4 + pminsd %xmm4, %xmm13 + pmaxsd %xmm4, %xmm1 + movdqu (%r10), %xmm4 + pmaxsd %xmm4, %xmm7 + pminsd %xmm4, %xmm3 + movdqu (%r8), %xmm4 + pmaxsd %xmm4, %xmm6 + pminsd %xmm4, %xmm12 + movdqu (%rsi), %xmm4 + movdqu (%rdi), %xmm5 + movdqu (%rax), %xmm10 + cmpq $1, -128(%rbp) + pminsd %xmm4, %xmm11 + pmaxsd %xmm4, %xmm5 + movdqu (%rcx), %xmm4 + movq -64(%rbp), %rcx + movaps %xmm4, -80(%rbp) + movdqu (%rcx), %xmm4 + movq -112(%rbp), %rcx + movaps %xmm4, -96(%rbp) + movdqa -96(%rbp), %xmm4 + pminsd -80(%rbp), %xmm4 + movdqu (%rcx), %xmm2 + movdqa -96(%rbp), %xmm0 + movaps %xmm4, -160(%rbp) + movdqu (%rcx), %xmm4 + pmaxsd -80(%rbp), %xmm0 + pminsd %xmm10, %xmm2 + pmaxsd %xmm10, %xmm4 + movdqa %xmm15, %xmm10 + pmaxsd %xmm14, %xmm15 + pminsd %xmm14, %xmm10 + movdqa %xmm9, %xmm14 + pmaxsd %xmm8, %xmm9 + movaps %xmm9, -80(%rbp) + movdqa %xmm1, %xmm9 + pminsd %xmm8, %xmm14 + pmaxsd %xmm7, %xmm1 + pminsd %xmm7, %xmm9 + movdqa %xmm13, %xmm8 + movdqa %xmm12, %xmm7 + pminsd %xmm3, %xmm8 + pminsd %xmm11, %xmm7 + pmaxsd %xmm13, %xmm3 + pmaxsd %xmm11, %xmm12 + movdqa %xmm6, %xmm13 + movdqa %xmm6, %xmm11 + movdqa -160(%rbp), %xmm6 + pmaxsd 
%xmm5, %xmm13 + pminsd %xmm5, %xmm11 + movdqa %xmm6, %xmm5 + pminsd %xmm2, %xmm5 + pmaxsd %xmm6, %xmm2 + movdqa %xmm0, %xmm6 + pminsd %xmm4, %xmm6 + pmaxsd %xmm4, %xmm0 + movdqa %xmm10, %xmm4 + pminsd %xmm8, %xmm4 + pmaxsd %xmm8, %xmm10 + movdqa %xmm14, %xmm8 + pminsd %xmm9, %xmm8 + pmaxsd %xmm9, %xmm14 + movdqa %xmm15, %xmm9 + pmaxsd %xmm3, %xmm15 + pminsd %xmm3, %xmm9 + movdqa -80(%rbp), %xmm3 + movaps %xmm15, -96(%rbp) + movdqa -80(%rbp), %xmm15 + pminsd %xmm1, %xmm3 + pmaxsd %xmm1, %xmm15 + movdqa %xmm7, %xmm1 + pminsd %xmm5, %xmm1 + pmaxsd %xmm7, %xmm5 + movdqa %xmm11, %xmm7 + pminsd %xmm6, %xmm7 + pmaxsd %xmm11, %xmm6 + movdqa %xmm12, %xmm11 + pminsd %xmm2, %xmm11 + pmaxsd %xmm2, %xmm12 + movdqa %xmm13, %xmm2 + pminsd %xmm0, %xmm2 + pmaxsd %xmm13, %xmm0 + movdqa %xmm4, %xmm13 + pminsd %xmm1, %xmm13 + pmaxsd %xmm4, %xmm1 + movaps %xmm13, -80(%rbp) + movdqa %xmm8, %xmm13 + pminsd %xmm7, %xmm13 + pmaxsd %xmm8, %xmm7 + movdqa %xmm9, %xmm8 + pminsd %xmm11, %xmm8 + pmaxsd %xmm9, %xmm11 + movdqa %xmm3, %xmm9 + pminsd %xmm2, %xmm9 + pmaxsd %xmm3, %xmm2 + movdqa %xmm10, %xmm3 + pminsd %xmm5, %xmm3 + pmaxsd %xmm10, %xmm5 + movdqa %xmm14, %xmm10 + pminsd %xmm6, %xmm10 + pmaxsd %xmm14, %xmm6 + movdqa -96(%rbp), %xmm14 + movdqa %xmm14, %xmm4 + pminsd %xmm12, %xmm4 + pmaxsd %xmm14, %xmm12 + movdqa %xmm15, %xmm14 + pminsd %xmm0, %xmm14 + pmaxsd %xmm15, %xmm0 + movdqa %xmm9, %xmm15 + movaps %xmm0, -96(%rbp) + movdqa %xmm10, %xmm0 + pmaxsd %xmm11, %xmm10 + pminsd %xmm5, %xmm15 + pminsd %xmm11, %xmm0 + movdqa %xmm4, %xmm11 + pmaxsd %xmm7, %xmm4 + pminsd %xmm7, %xmm11 + movdqa %xmm14, %xmm7 + pmaxsd %xmm2, %xmm14 + pminsd %xmm2, %xmm7 + movdqa %xmm3, %xmm2 + pmaxsd %xmm1, %xmm3 + pminsd %xmm1, %xmm2 + movdqa %xmm13, %xmm1 + pmaxsd %xmm9, %xmm5 + pminsd %xmm8, %xmm1 + movdqa %xmm6, %xmm9 + pmaxsd %xmm13, %xmm8 + pminsd %xmm12, %xmm9 + pmaxsd %xmm6, %xmm12 + movdqa %xmm1, %xmm6 + pminsd %xmm2, %xmm6 + pmaxsd %xmm2, %xmm1 + movdqa %xmm7, %xmm2 + movaps %xmm6, -112(%rbp) + movdqa %xmm8, %xmm6 + pminsd %xmm9, %xmm2 + pmaxsd %xmm3, %xmm8 + pminsd %xmm3, %xmm6 + pmaxsd %xmm7, %xmm9 + movdqa %xmm2, %xmm13 + movdqa %xmm6, %xmm3 + movdqa %xmm14, %xmm7 + pminsd %xmm5, %xmm13 + pminsd %xmm1, %xmm3 + pminsd %xmm12, %xmm7 + pmaxsd %xmm12, %xmm14 + movdqa %xmm0, %xmm12 + pmaxsd %xmm1, %xmm6 + movaps %xmm3, -176(%rbp) + movdqa %xmm4, %xmm3 + pmaxsd %xmm11, %xmm0 + movdqa %xmm15, %xmm1 + movaps %xmm14, -160(%rbp) + pminsd %xmm10, %xmm3 + pminsd %xmm11, %xmm12 + movdqa %xmm15, %xmm11 + movdqa %xmm13, %xmm15 + pmaxsd %xmm10, %xmm4 + pmaxsd %xmm2, %xmm5 + pminsd %xmm8, %xmm11 + pmaxsd %xmm8, %xmm1 + movdqa %xmm0, %xmm8 + pminsd %xmm3, %xmm15 + pminsd %xmm1, %xmm8 + movdqa %xmm7, %xmm10 + pmaxsd %xmm0, %xmm1 + pmaxsd %xmm3, %xmm13 + movdqa %xmm4, %xmm0 + movdqa %xmm15, %xmm3 + pminsd %xmm9, %xmm10 + pminsd %xmm5, %xmm0 + pminsd %xmm1, %xmm3 + pmaxsd %xmm1, %xmm15 + movdqa %xmm11, %xmm2 + movdqa %xmm13, %xmm1 + pmaxsd %xmm5, %xmm4 + pmaxsd %xmm12, %xmm11 + pminsd %xmm0, %xmm1 + movdqa %xmm11, %xmm5 + movdqa %xmm15, %xmm14 + pminsd %xmm12, %xmm2 + pmaxsd %xmm8, %xmm11 + pmaxsd %xmm0, %xmm13 + movdqa %xmm10, %xmm0 + pmaxsd %xmm9, %xmm7 + pminsd %xmm4, %xmm0 + movdqa %xmm2, %xmm9 + pmaxsd %xmm10, %xmm4 + movdqa %xmm11, %xmm10 + pminsd %xmm6, %xmm9 + pmaxsd %xmm6, %xmm2 + pminsd %xmm8, %xmm5 + pminsd %xmm3, %xmm10 + pmaxsd %xmm3, %xmm11 + pminsd %xmm1, %xmm14 + pmaxsd %xmm1, %xmm15 + jbe .L368 + movdqa -80(%rbp), %xmm3 + pshufd $177, -96(%rbp), %xmm6 + pshufd $177, %xmm7, %xmm7 + pshufd $177, -160(%rbp), %xmm8 + pshufd $177, 
%xmm4, %xmm4 + pshufd $177, %xmm0, %xmm0 + pshufd $177, %xmm13, %xmm13 + cmpq $3, -128(%rbp) + movdqa %xmm3, %xmm12 + pshufd $177, %xmm15, %xmm15 + pshufd $177, %xmm14, %xmm14 + pminsd %xmm6, %xmm12 + pmaxsd %xmm3, %xmm6 + movdqa -112(%rbp), %xmm3 + movaps %xmm6, -80(%rbp) + movdqa -176(%rbp), %xmm6 + movdqa %xmm3, %xmm1 + pminsd %xmm8, %xmm1 + pmaxsd %xmm3, %xmm8 + movdqa %xmm6, %xmm3 + pminsd %xmm7, %xmm3 + pmaxsd %xmm6, %xmm7 + movdqa %xmm9, %xmm6 + pminsd %xmm4, %xmm6 + pmaxsd %xmm4, %xmm9 + movdqa %xmm2, %xmm4 + pminsd %xmm0, %xmm4 + pmaxsd %xmm0, %xmm2 + movdqa %xmm5, %xmm0 + movaps %xmm6, -96(%rbp) + pmaxsd %xmm13, %xmm5 + pshufd $177, %xmm7, %xmm7 + pshufd $177, %xmm4, %xmm4 + pshufd $177, -80(%rbp), %xmm6 + pminsd %xmm13, %xmm0 + movdqa %xmm10, %xmm13 + pmaxsd %xmm15, %xmm10 + pminsd %xmm15, %xmm13 + movdqa %xmm11, %xmm15 + pmaxsd %xmm14, %xmm11 + pminsd %xmm14, %xmm15 + movdqa %xmm12, %xmm14 + pshufd $177, %xmm13, %xmm13 + pshufd $177, %xmm15, %xmm15 + pshufd $177, %xmm8, %xmm8 + pshufd $177, %xmm0, %xmm0 + pminsd %xmm15, %xmm14 + pmaxsd %xmm15, %xmm12 + movdqa %xmm11, %xmm15 + pminsd %xmm6, %xmm15 + pmaxsd %xmm6, %xmm11 + movdqa %xmm1, %xmm6 + pminsd %xmm13, %xmm6 + pmaxsd %xmm13, %xmm1 + movdqa %xmm10, %xmm13 + movaps %xmm15, -80(%rbp) + pminsd %xmm8, %xmm13 + pmaxsd %xmm8, %xmm10 + movdqa %xmm3, %xmm8 + pminsd %xmm0, %xmm8 + pmaxsd %xmm0, %xmm3 + movdqa %xmm5, %xmm0 + pminsd %xmm7, %xmm0 + pmaxsd %xmm7, %xmm5 + movdqa -96(%rbp), %xmm7 + movdqa %xmm13, %xmm15 + pshufd $177, %xmm9, %xmm9 + pshufd $177, %xmm8, %xmm8 + pshufd $177, %xmm12, %xmm12 + movdqa %xmm7, %xmm13 + pshufd $177, %xmm1, %xmm1 + pshufd $177, %xmm0, %xmm0 + pminsd %xmm4, %xmm13 + pmaxsd %xmm7, %xmm4 + movdqa %xmm2, %xmm7 + pshufd $177, %xmm13, %xmm13 + pmaxsd %xmm9, %xmm2 + pminsd %xmm9, %xmm7 + movdqa %xmm14, %xmm9 + pmaxsd %xmm13, %xmm14 + pshufd $177, %xmm7, %xmm7 + pminsd %xmm13, %xmm9 + movdqa %xmm6, %xmm13 + pmaxsd %xmm8, %xmm6 + pminsd %xmm8, %xmm13 + movdqa %xmm4, %xmm8 + pmaxsd %xmm12, %xmm4 + pminsd %xmm12, %xmm8 + movaps %xmm13, -96(%rbp) + movdqa -80(%rbp), %xmm12 + movdqa %xmm3, %xmm13 + pminsd %xmm1, %xmm13 + pmaxsd %xmm1, %xmm3 + pshufd $177, %xmm11, %xmm11 + movdqa %xmm12, %xmm1 + pshufd $177, %xmm10, %xmm10 + pshufd $177, %xmm4, %xmm4 + pminsd %xmm7, %xmm1 + pmaxsd %xmm12, %xmm7 + movdqa %xmm15, %xmm12 + pminsd %xmm0, %xmm12 + pmaxsd %xmm15, %xmm0 + movdqa %xmm2, %xmm15 + pminsd %xmm11, %xmm15 + pmaxsd %xmm11, %xmm2 + movdqa %xmm5, %xmm11 + pminsd %xmm10, %xmm11 + movaps %xmm15, -80(%rbp) + pmaxsd %xmm10, %xmm5 + movdqa %xmm9, %xmm10 + pshufd $177, %xmm14, %xmm14 + pshufd $177, %xmm7, %xmm7 + pshufd $177, %xmm2, %xmm2 + pshufd $177, -96(%rbp), %xmm15 + pminsd %xmm15, %xmm10 + pmaxsd %xmm15, %xmm9 + movdqa %xmm6, %xmm15 + pminsd %xmm14, %xmm15 + pmaxsd %xmm14, %xmm6 + pshufd $177, %xmm13, %xmm13 + movdqa %xmm8, %xmm14 + pmaxsd %xmm13, %xmm8 + pshufd $177, %xmm12, %xmm12 + pminsd %xmm13, %xmm14 + movdqa %xmm3, %xmm13 + pmaxsd %xmm4, %xmm3 + pminsd %xmm4, %xmm13 + movdqa %xmm1, %xmm4 + pmaxsd %xmm12, %xmm1 + pminsd %xmm12, %xmm4 + movdqa %xmm0, %xmm12 + pmaxsd %xmm7, %xmm0 + movaps %xmm0, -96(%rbp) + movdqa -80(%rbp), %xmm0 + pminsd %xmm7, %xmm12 + pshufd $177, %xmm11, %xmm11 + movdqa %xmm0, %xmm7 + pminsd %xmm11, %xmm7 + pmaxsd %xmm0, %xmm11 + movdqa %xmm5, %xmm0 + pmaxsd %xmm2, %xmm5 + pminsd %xmm2, %xmm0 + pshufd $177, %xmm10, %xmm2 + movaps %xmm5, -112(%rbp) + movdqa %xmm10, %xmm5 + pmaxsd %xmm2, %xmm10 + pminsd %xmm2, %xmm5 + pshufd $177, %xmm9, %xmm2 + movaps %xmm0, -80(%rbp) + blendps $5, 
%xmm5, %xmm10 + movdqa %xmm9, %xmm5 + pmaxsd %xmm2, %xmm9 + pminsd %xmm2, %xmm5 + movdqa %xmm15, %xmm2 + blendps $5, %xmm5, %xmm9 + pshufd $177, %xmm15, %xmm5 + pminsd %xmm5, %xmm2 + pmaxsd %xmm5, %xmm15 + pshufd $177, %xmm6, %xmm5 + blendps $5, %xmm2, %xmm15 + movdqa %xmm6, %xmm2 + pmaxsd %xmm5, %xmm6 + pminsd %xmm5, %xmm2 + pshufd $177, %xmm14, %xmm5 + blendps $5, %xmm2, %xmm6 + movdqa %xmm14, %xmm2 + pmaxsd %xmm5, %xmm14 + pminsd %xmm5, %xmm2 + movaps %xmm6, -160(%rbp) + movaps %xmm14, %xmm6 + pshufd $177, %xmm8, %xmm5 + blendps $5, %xmm2, %xmm6 + movdqa %xmm8, %xmm2 + pmaxsd %xmm5, %xmm8 + pminsd %xmm5, %xmm2 + movaps %xmm6, -176(%rbp) + movaps %xmm8, %xmm6 + pshufd $177, %xmm13, %xmm5 + blendps $5, %xmm2, %xmm6 + movdqa %xmm13, %xmm2 + pmaxsd %xmm5, %xmm13 + pminsd %xmm5, %xmm2 + movaps %xmm6, -192(%rbp) + movaps %xmm13, %xmm6 + pshufd $177, %xmm3, %xmm5 + blendps $5, %xmm2, %xmm6 + movdqa %xmm3, %xmm2 + pmaxsd %xmm5, %xmm3 + pminsd %xmm5, %xmm2 + movaps %xmm3, %xmm13 + movaps %xmm6, -208(%rbp) + pshufd $177, %xmm4, %xmm6 + pshufd $177, %xmm1, %xmm3 + blendps $5, %xmm2, %xmm13 + movdqa %xmm4, %xmm2 + movdqa -96(%rbp), %xmm5 + pminsd %xmm6, %xmm2 + pmaxsd %xmm4, %xmm6 + pshufd $177, %xmm7, %xmm8 + blendps $5, %xmm2, %xmm6 + movdqa %xmm1, %xmm2 + pmaxsd %xmm3, %xmm1 + pminsd %xmm3, %xmm2 + pshufd $177, %xmm12, %xmm3 + pshufd $177, %xmm5, %xmm14 + blendps $5, %xmm2, %xmm1 + movdqa %xmm12, %xmm2 + movdqa %xmm7, %xmm4 + pminsd %xmm3, %xmm2 + pmaxsd %xmm3, %xmm12 + movdqa %xmm11, %xmm3 + blendps $5, %xmm2, %xmm12 + movdqa %xmm5, %xmm2 + pmaxsd %xmm14, %xmm5 + pminsd %xmm14, %xmm2 + movaps %xmm5, %xmm14 + pminsd %xmm8, %xmm4 + blendps $5, %xmm2, %xmm14 + pshufd $177, %xmm11, %xmm2 + pmaxsd %xmm7, %xmm8 + movdqa -80(%rbp), %xmm7 + pmaxsd %xmm2, %xmm11 + movdqa %xmm4, %xmm0 + pminsd %xmm2, %xmm3 + movaps %xmm8, %xmm4 + movaps %xmm11, %xmm2 + blendps $5, %xmm0, %xmm4 + blendps $5, %xmm3, %xmm2 + movdqa %xmm7, %xmm0 + pshufd $177, %xmm7, %xmm3 + pminsd %xmm3, %xmm0 + pmaxsd %xmm7, %xmm3 + movdqa -112(%rbp), %xmm7 + blendps $5, %xmm0, %xmm3 + pshufd $177, %xmm7, %xmm5 + movdqa %xmm7, %xmm0 + pminsd %xmm5, %xmm0 + pmaxsd %xmm7, %xmm5 + blendps $5, %xmm0, %xmm5 + movaps %xmm5, -224(%rbp) + jbe .L369 + pshufd $27, %xmm2, %xmm7 + pshufd $27, %xmm3, %xmm2 + movaps -160(%rbp), %xmm3 + pshufd $27, %xmm12, %xmm0 + pshufd $27, %xmm4, %xmm8 + pshufd $27, %xmm1, %xmm11 + pshufd $27, %xmm6, %xmm6 + pshufd $27, -224(%rbp), %xmm5 + movdqa %xmm3, %xmm4 + movdqa %xmm5, %xmm12 + pmaxsd %xmm10, %xmm5 + pminsd %xmm10, %xmm12 + pminsd %xmm8, %xmm4 + movdqa %xmm2, %xmm10 + pmaxsd %xmm3, %xmm8 + movaps -176(%rbp), %xmm3 + pshufd $27, %xmm14, %xmm1 + pminsd %xmm9, %xmm10 + pmaxsd %xmm2, %xmm9 + movdqa %xmm7, %xmm2 + movaps %xmm4, -80(%rbp) + movaps -192(%rbp), %xmm14 + pminsd %xmm15, %xmm2 + pmaxsd %xmm15, %xmm7 + movdqa %xmm3, %xmm4 + movaps -208(%rbp), %xmm15 + pminsd %xmm1, %xmm4 + pmaxsd %xmm3, %xmm1 + movdqa %xmm14, %xmm3 + pminsd %xmm0, %xmm3 + pmaxsd %xmm14, %xmm0 + movdqa %xmm15, %xmm14 + pminsd %xmm11, %xmm14 + pmaxsd %xmm15, %xmm11 + movdqa %xmm6, %xmm15 + pminsd %xmm13, %xmm15 + pshufd $27, %xmm5, %xmm5 + pmaxsd %xmm13, %xmm6 + pshufd $27, %xmm15, %xmm15 + movdqa %xmm12, %xmm13 + pshufd $27, %xmm3, %xmm3 + pminsd %xmm15, %xmm13 + pshufd $27, %xmm7, %xmm7 + pshufd $27, %xmm4, %xmm4 + pmaxsd %xmm15, %xmm12 + movdqa %xmm6, %xmm15 + pshufd $27, %xmm14, %xmm14 + pminsd %xmm5, %xmm15 + pmaxsd %xmm5, %xmm6 + movdqa %xmm10, %xmm5 + pshufd $27, %xmm9, %xmm9 + pminsd %xmm14, %xmm5 + movaps %xmm15, -96(%rbp) + pmaxsd 
%xmm14, %xmm10 + movdqa %xmm11, %xmm14 + pmaxsd %xmm9, %xmm11 + pshufd $27, %xmm8, %xmm8 + pminsd %xmm9, %xmm14 + movdqa %xmm2, %xmm9 + pmaxsd %xmm3, %xmm2 + pminsd %xmm3, %xmm9 + movdqa %xmm0, %xmm3 + pmaxsd %xmm7, %xmm0 + pminsd %xmm7, %xmm3 + movdqa -80(%rbp), %xmm7 + movdqa %xmm14, %xmm15 + pshufd $27, %xmm9, %xmm9 + pshufd $27, %xmm12, %xmm12 + pshufd $27, %xmm3, %xmm3 + pshufd $27, %xmm6, %xmm6 + movdqa %xmm7, %xmm14 + pshufd $27, %xmm11, %xmm11 + pminsd %xmm4, %xmm14 + pmaxsd %xmm7, %xmm4 + movdqa %xmm1, %xmm7 + pminsd %xmm8, %xmm7 + pmaxsd %xmm8, %xmm1 + pshufd $27, %xmm14, %xmm14 + pshufd $27, %xmm10, %xmm8 + movdqa %xmm13, %xmm10 + pshufd $27, %xmm7, %xmm7 + pminsd %xmm14, %xmm10 + pmaxsd %xmm13, %xmm14 + movdqa %xmm5, %xmm13 + pminsd %xmm9, %xmm13 + pmaxsd %xmm9, %xmm5 + movdqa %xmm4, %xmm9 + pminsd %xmm12, %xmm9 + pmaxsd %xmm12, %xmm4 + movdqa -96(%rbp), %xmm12 + movaps %xmm13, -80(%rbp) + movdqa %xmm2, %xmm13 + pmaxsd %xmm8, %xmm2 + pshufd $27, %xmm14, %xmm14 + pminsd %xmm8, %xmm13 + movdqa %xmm12, %xmm8 + pshufd $27, %xmm4, %xmm4 + pminsd %xmm7, %xmm8 + pmaxsd %xmm12, %xmm7 + movdqa %xmm15, %xmm12 + pminsd %xmm3, %xmm12 + pmaxsd %xmm15, %xmm3 + movdqa %xmm1, %xmm15 + pminsd %xmm6, %xmm15 + pmaxsd %xmm6, %xmm1 + movdqa %xmm0, %xmm6 + pminsd %xmm11, %xmm6 + movaps %xmm15, -96(%rbp) + pmaxsd %xmm11, %xmm0 + movdqa %xmm10, %xmm11 + pshufd $27, %xmm13, %xmm13 + pshufd $27, %xmm12, %xmm12 + pshufd $27, %xmm7, %xmm7 + pshufd $27, -80(%rbp), %xmm15 + pminsd %xmm15, %xmm11 + pmaxsd %xmm15, %xmm10 + movdqa %xmm5, %xmm15 + pminsd %xmm14, %xmm15 + pmaxsd %xmm14, %xmm5 + movdqa %xmm9, %xmm14 + pminsd %xmm13, %xmm14 + pmaxsd %xmm13, %xmm9 + movdqa %xmm2, %xmm13 + pmaxsd %xmm4, %xmm2 + pminsd %xmm4, %xmm13 + movdqa %xmm8, %xmm4 + movaps %xmm2, -80(%rbp) + movdqa -96(%rbp), %xmm2 + pminsd %xmm12, %xmm4 + pmaxsd %xmm12, %xmm8 + movdqa %xmm3, %xmm12 + pshufd $27, %xmm1, %xmm1 + pmaxsd %xmm7, %xmm3 + pminsd %xmm7, %xmm12 + pshufd $27, %xmm6, %xmm6 + movdqa %xmm2, %xmm7 + pminsd %xmm6, %xmm7 + pmaxsd %xmm2, %xmm6 + movdqa %xmm0, %xmm2 + pmaxsd %xmm1, %xmm0 + pminsd %xmm1, %xmm2 + movdqa %xmm11, %xmm1 + movaps %xmm0, -112(%rbp) + pshufd $27, %xmm11, %xmm0 + pminsd %xmm0, %xmm1 + pmaxsd %xmm0, %xmm11 + movaps %xmm2, -96(%rbp) + pshufd $27, %xmm10, %xmm0 + movsd %xmm1, %xmm11 + movdqa %xmm10, %xmm1 + pmaxsd %xmm0, %xmm10 + movdqa -80(%rbp), %xmm2 + pminsd %xmm0, %xmm1 + pshufd $27, %xmm15, %xmm0 + movsd %xmm1, %xmm10 + movdqa %xmm15, %xmm1 + pmaxsd %xmm0, %xmm15 + pminsd %xmm0, %xmm1 + pshufd $27, %xmm5, %xmm0 + movsd %xmm1, %xmm15 + movdqa %xmm5, %xmm1 + pmaxsd %xmm0, %xmm5 + pminsd %xmm0, %xmm1 + pshufd $27, %xmm14, %xmm0 + movsd %xmm1, %xmm5 + movdqa %xmm14, %xmm1 + pmaxsd %xmm0, %xmm14 + pminsd %xmm0, %xmm1 + pshufd $27, %xmm9, %xmm0 + movsd %xmm1, %xmm14 + movdqa %xmm9, %xmm1 + pmaxsd %xmm0, %xmm9 + pminsd %xmm0, %xmm1 + pshufd $27, %xmm13, %xmm0 + movsd %xmm1, %xmm9 + movdqa %xmm13, %xmm1 + pmaxsd %xmm0, %xmm13 + pminsd %xmm0, %xmm1 + pshufd $27, %xmm2, %xmm0 + movsd %xmm1, %xmm13 + movdqa %xmm2, %xmm1 + pmaxsd %xmm0, %xmm2 + pminsd %xmm0, %xmm1 + pshufd $27, %xmm4, %xmm0 + movsd %xmm1, %xmm2 + movdqa %xmm4, %xmm1 + pmaxsd %xmm0, %xmm4 + pminsd %xmm0, %xmm1 + pshufd $27, %xmm8, %xmm0 + movsd %xmm1, %xmm4 + movdqa %xmm8, %xmm1 + pmaxsd %xmm0, %xmm8 + pminsd %xmm0, %xmm1 + pshufd $27, %xmm12, %xmm0 + movsd %xmm1, %xmm8 + movdqa %xmm12, %xmm1 + pmaxsd %xmm0, %xmm12 + pminsd %xmm0, %xmm1 + pshufd $27, %xmm3, %xmm0 + movsd %xmm1, %xmm12 + movdqa %xmm3, %xmm1 + pmaxsd %xmm0, %xmm3 + pminsd %xmm0, 
%xmm1 + pshufd $27, %xmm7, %xmm0 + movsd %xmm1, %xmm3 + movdqa %xmm7, %xmm1 + pmaxsd %xmm0, %xmm7 + pminsd %xmm0, %xmm1 + pshufd $27, %xmm6, %xmm0 + movsd %xmm1, %xmm7 + movdqa %xmm6, %xmm1 + pmaxsd %xmm0, %xmm6 + pminsd %xmm0, %xmm1 + movapd %xmm6, %xmm0 + movdqa -96(%rbp), %xmm6 + movsd %xmm1, %xmm0 + movaps %xmm0, -176(%rbp) + movdqa %xmm6, %xmm1 + pshufd $27, %xmm6, %xmm0 + pminsd %xmm0, %xmm1 + pmaxsd %xmm6, %xmm0 + movapd %xmm0, %xmm6 + movsd %xmm1, %xmm6 + movaps %xmm6, -192(%rbp) + movdqa -112(%rbp), %xmm6 + pshufd $27, %xmm6, %xmm0 + movdqa %xmm6, %xmm1 + pminsd %xmm0, %xmm1 + pmaxsd %xmm6, %xmm0 + movapd %xmm0, %xmm6 + pshufd $177, %xmm11, %xmm0 + movsd %xmm1, %xmm6 + movdqa %xmm0, %xmm1 + pmaxsd %xmm11, %xmm0 + pminsd %xmm11, %xmm1 + movaps %xmm0, %xmm11 + pshufd $177, %xmm10, %xmm0 + movaps %xmm6, -208(%rbp) + blendps $5, %xmm1, %xmm11 + movdqa %xmm0, %xmm1 + pshufd $177, %xmm4, %xmm6 + pmaxsd %xmm10, %xmm0 + pminsd %xmm10, %xmm1 + movaps %xmm11, -80(%rbp) + movaps %xmm0, %xmm10 + blendps $5, %xmm1, %xmm10 + pshufd $177, %xmm15, %xmm1 + movdqa %xmm1, %xmm0 + pmaxsd %xmm15, %xmm1 + movaps %xmm10, -96(%rbp) + pshufd $177, %xmm13, %xmm10 + pminsd %xmm15, %xmm0 + pshufd $177, %xmm8, %xmm15 + blendps $5, %xmm0, %xmm1 + movaps %xmm1, -112(%rbp) + pshufd $177, %xmm5, %xmm1 + movdqa %xmm1, %xmm0 + pmaxsd %xmm5, %xmm1 + pminsd %xmm5, %xmm0 + movaps %xmm1, %xmm5 + pshufd $177, %xmm14, %xmm1 + blendps $5, %xmm0, %xmm5 + movaps %xmm5, -128(%rbp) + movdqa %xmm1, %xmm5 + pmaxsd %xmm14, %xmm1 + pminsd %xmm14, %xmm5 + movdqa %xmm5, %xmm0 + movaps %xmm1, %xmm5 + pshufd $177, %xmm9, %xmm1 + blendps $5, %xmm0, %xmm5 + movaps %xmm5, -160(%rbp) + movdqa %xmm1, %xmm5 + pmaxsd %xmm9, %xmm1 + pminsd %xmm9, %xmm5 + movdqa %xmm5, %xmm0 + movaps %xmm1, %xmm5 + blendps $5, %xmm0, %xmm5 + movdqa %xmm10, %xmm0 + pmaxsd %xmm13, %xmm10 + pminsd %xmm13, %xmm0 + pshufd $177, %xmm2, %xmm13 + blendps $5, %xmm0, %xmm10 + movdqa %xmm13, %xmm0 + pmaxsd %xmm2, %xmm13 + pminsd %xmm2, %xmm0 + movaps %xmm13, %xmm1 + pshufd $177, %xmm12, %xmm13 + movapd -192(%rbp), %xmm2 + blendps $5, %xmm0, %xmm1 + movdqa %xmm6, %xmm0 + pmaxsd %xmm4, %xmm6 + pminsd %xmm4, %xmm0 + movdqa %xmm15, %xmm4 + pmaxsd %xmm8, %xmm15 + pminsd %xmm8, %xmm4 + blendps $5, %xmm0, %xmm6 + movaps %xmm1, %xmm9 + pshufd $177, %xmm3, %xmm0 + blendps $5, %xmm4, %xmm15 + movdqa %xmm13, %xmm1 + movdqa %xmm0, %xmm4 + pmaxsd %xmm3, %xmm0 + pminsd %xmm12, %xmm1 + pminsd %xmm3, %xmm4 + pmaxsd %xmm12, %xmm13 + movapd -176(%rbp), %xmm3 + blendps $5, %xmm4, %xmm0 + pshufd $177, %xmm7, %xmm4 + blendps $5, %xmm1, %xmm13 + movdqa %xmm4, %xmm1 + pmaxsd %xmm7, %xmm4 + pminsd %xmm7, %xmm1 + pshufd $177, %xmm3, %xmm7 + blendps $5, %xmm1, %xmm4 + movdqa %xmm3, %xmm1 + pminsd %xmm7, %xmm1 + pmaxsd %xmm3, %xmm7 + pshufd $177, %xmm2, %xmm3 + blendps $5, %xmm1, %xmm7 + movdqa %xmm2, %xmm1 + pminsd %xmm3, %xmm1 + pmaxsd %xmm2, %xmm3 + movapd -208(%rbp), %xmm2 + blendps $5, %xmm1, %xmm3 + pshufd $177, %xmm2, %xmm8 + movdqa %xmm2, %xmm1 + pminsd %xmm8, %xmm1 + pmaxsd %xmm2, %xmm8 + blendps $5, %xmm1, %xmm8 +.L365: + movdqa -80(%rbp), %xmm2 + movups %xmm2, (%rdx) + movdqa -96(%rbp), %xmm2 + movq -64(%rbp), %rdx + movups %xmm2, (%r15) + movdqa -112(%rbp), %xmm2 + movups %xmm2, (%r14) + movdqa -128(%rbp), %xmm2 + movups %xmm2, 0(%r13) + movdqa -160(%rbp), %xmm2 + movups %xmm2, (%r12) + movups %xmm5, (%rbx) + movq -136(%rbp), %rbx + movups %xmm10, (%r11) + movups %xmm9, (%r10) + movups %xmm6, (%r9) + movups %xmm15, (%r8) + movups %xmm13, (%rdi) + movups %xmm0, (%rsi) + movups %xmm4, 
(%rdx) + movups %xmm7, (%rbx) + movq -144(%rbp), %rbx + movups %xmm3, (%rbx) + movups %xmm8, (%rax) + addq $64, %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L368: + .cfi_restore_state + movdqa -112(%rbp), %xmm3 + movdqa -96(%rbp), %xmm8 + movaps %xmm9, -128(%rbp) + movdqa %xmm11, %xmm9 + movdqa -176(%rbp), %xmm6 + movaps %xmm3, -96(%rbp) + movdqa -160(%rbp), %xmm3 + movaps %xmm6, -112(%rbp) + movdqa %xmm14, %xmm6 + movaps %xmm2, -160(%rbp) + jmp .L365 + .p2align 4,,10 + .p2align 3 +.L369: + movaps %xmm10, -80(%rbp) + movdqa %xmm14, %xmm0 + movdqa %xmm2, %xmm7 + movdqa -160(%rbp), %xmm5 + movdqa -208(%rbp), %xmm10 + movaps %xmm9, -96(%rbp) + movdqa -224(%rbp), %xmm8 + movdqa %xmm13, %xmm9 + movaps %xmm5, -128(%rbp) + movdqa -176(%rbp), %xmm5 + movdqa %xmm12, %xmm13 + movaps %xmm15, -112(%rbp) + movdqa %xmm1, %xmm15 + movaps %xmm5, -160(%rbp) + movdqa -192(%rbp), %xmm5 + jmp .L365 + .cfi_endproc +.LFE18794: + .size _ZN3hwy6N_SSE46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0, .-_ZN3hwy6N_SSE46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + .section .text._ZN3hwy7N_SSSE36detail22MaybePartitionTwoValueINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy7N_SSSE36detail22MaybePartitionTwoValueINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, @function +_ZN3hwy7N_SSSE36detail22MaybePartitionTwoValueINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0: +.LFB18795: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + pushq %r14 + .cfi_offset 15, -24 + .cfi_offset 14, -32 + movq %rcx, %r14 + pushq %r13 + .cfi_offset 13, -40 + movq %rsi, %r13 + pushq %r12 + .cfi_offset 12, -48 + movq %rdi, %r12 + pushq %rbx + subq $88, %rsp + .cfi_offset 3, -56 + movq %rdx, -120(%rbp) + movaps %xmm0, -80(%rbp) + movaps %xmm1, -64(%rbp) + movaps %xmm0, -112(%rbp) + movaps %xmm1, -96(%rbp) + cmpq $3, %rsi + jbe .L401 + movl $4, %r15d + xorl %ebx, %ebx + jmp .L380 + .p2align 4,,10 + .p2align 3 +.L372: + movdqa -80(%rbp), %xmm5 + movmskps %xmm1, %edi + movups %xmm5, (%r12,%rbx,4) + call __popcountdi2@PLT + cltq + addq %rax, %rbx + leaq 4(%r15), %rax + cmpq %r13, %rax + ja .L457 + movq %rax, %r15 +.L380: + movdqu -16(%r12,%r15,4), %xmm1 + movdqu -16(%r12,%r15,4), %xmm0 + leaq -4(%r15), %rdx + pcmpeqd -96(%rbp), %xmm0 + pcmpeqd -112(%rbp), %xmm1 + movdqa %xmm0, %xmm2 + por %xmm1, %xmm0 + movmskps %xmm0, %eax + cmpl $15, %eax + je .L372 + pcmpeqd %xmm0, %xmm0 + pxor %xmm0, %xmm2 + pandn %xmm2, %xmm1 + movmskps %xmm1, %eax + rep bsfl %eax, %eax + cltq + addq %rdx, %rax + movd (%r12,%rax,4), %xmm3 + movq -120(%rbp), %rax + pshufd $0, %xmm3, %xmm0 + movaps %xmm0, (%rax) + leaq 4(%rbx), %rax + cmpq %rdx, %rax + ja .L373 + .p2align 4,,10 + .p2align 3 +.L374: + movdqa -64(%rbp), %xmm4 + movq %rax, %rbx + movups %xmm4, -16(%r12,%rax,4) + addq $4, %rax + cmpq %rdx, %rax + jbe .L374 +.L373: + subq %rbx, %rdx + leaq 0(,%rbx,4), %rcx + movd %edx, %xmm3 + pshufd $0, %xmm3, %xmm0 + pcmpgtd .LC0(%rip), %xmm0 + movd %xmm0, %eax + testl 
%eax, %eax + je .L375 + movdqa -64(%rbp), %xmm3 + movd %xmm3, (%r12,%rbx,4) +.L375: + pshufd $85, %xmm0, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L376 + pshufd $85, -64(%rbp), %xmm1 + movd %xmm1, 4(%r12,%rcx) +.L376: + movdqa %xmm0, %xmm1 + punpckhdq %xmm0, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L377 + movdqa -64(%rbp), %xmm3 + movdqa %xmm3, %xmm1 + punpckhdq %xmm3, %xmm1 + movd %xmm1, 8(%r12,%rcx) +.L377: + pshufd $255, %xmm0, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + jne .L458 +.L390: + addq $88, %rsp + xorl %eax, %eax + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L458: + .cfi_restore_state + pshufd $255, -64(%rbp), %xmm0 + movd %xmm0, 12(%r12,%rcx) + jmp .L390 + .p2align 4,,10 + .p2align 3 +.L457: + movq %r13, %r8 + leaq 0(,%r15,4), %rsi + leaq 0(,%rbx,4), %r9 + subq %r15, %r8 +.L371: + testq %r8, %r8 + je .L384 + leaq 0(,%r8,4), %rdx + addq %r12, %rsi + movq %r14, %rdi + movq %r9, -112(%rbp) + movq %r8, -96(%rbp) + call memcpy@PLT + movq -96(%rbp), %r8 + movq -112(%rbp), %r9 +.L384: + movd %r8d, %xmm3 + movdqa (%r14), %xmm2 + movdqa -80(%rbp), %xmm1 + pshufd $0, %xmm3, %xmm0 + movdqa .LC0(%rip), %xmm3 + pcmpeqd %xmm2, %xmm1 + pcmpeqd -64(%rbp), %xmm2 + pcmpgtd %xmm3, %xmm0 + movdqa %xmm0, %xmm5 + pand %xmm1, %xmm5 + por %xmm2, %xmm1 + pcmpeqd %xmm2, %xmm2 + movdqa %xmm2, %xmm4 + pxor %xmm0, %xmm4 + por %xmm4, %xmm1 + movmskps %xmm1, %eax + cmpl $15, %eax + jne .L459 + movd %xmm0, %eax + testl %eax, %eax + je .L391 + movdqa -80(%rbp), %xmm4 + movd %xmm4, (%r12,%r9) +.L391: + pshufd $85, %xmm0, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L392 + pshufd $85, -80(%rbp), %xmm1 + movd %xmm1, 4(%r12,%r9) +.L392: + movdqa %xmm0, %xmm1 + punpckhdq %xmm0, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L393 + movdqa -80(%rbp), %xmm7 + movdqa %xmm7, %xmm1 + punpckhdq %xmm7, %xmm1 + movd %xmm1, 8(%r12,%r9) +.L393: + pshufd $255, %xmm0, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + jne .L460 +.L394: + movmskps %xmm5, %edi + call __popcountdi2@PLT + movdqa .LC0(%rip), %xmm3 + movslq %eax, %rdx + addq %rbx, %rdx + leaq 4(%rdx), %rax + cmpq %rax, %r13 + jb .L395 + .p2align 4,,10 + .p2align 3 +.L396: + movdqa -64(%rbp), %xmm2 + movq %rax, %rdx + movups %xmm2, -16(%r12,%rax,4) + addq $4, %rax + cmpq %rax, %r13 + jnb .L396 +.L395: + subq %rdx, %r13 + leaq 0(,%rdx,4), %rcx + movd %r13d, %xmm4 + pshufd $0, %xmm4, %xmm0 + pcmpgtd %xmm3, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L397 + movdqa -64(%rbp), %xmm3 + movd %xmm3, (%r12,%rdx,4) +.L397: + pshufd $85, %xmm0, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L398 + pshufd $85, -64(%rbp), %xmm1 + movd %xmm1, 4(%r12,%rcx) +.L398: + movdqa %xmm0, %xmm1 + punpckhdq %xmm0, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L399 + movdqa -64(%rbp), %xmm3 + movdqa %xmm3, %xmm1 + punpckhdq %xmm3, %xmm1 + movd %xmm1, 8(%r12,%rcx) +.L399: + pshufd $255, %xmm0, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L400 + pshufd $255, -64(%rbp), %xmm0 + movd %xmm0, 12(%r12,%rcx) +.L400: + addq $88, %rsp + movl $1, %eax + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L460: + .cfi_restore_state + pshufd $255, -80(%rbp), %xmm0 + movd %xmm0, 12(%r12,%r9) + jmp .L394 +.L401: + movq %rsi, %r8 + xorl %r9d, %r9d + xorl %esi, %esi + xorl %ebx, %ebx + xorl %r15d, %r15d + jmp .L371 +.L459: + pxor %xmm2, %xmm1 + movmskps %xmm1, %eax + 
rep bsfl %eax, %eax + cltq + addq %r15, %rax + movd (%r12,%rax,4), %xmm4 + movq -120(%rbp), %rax + pshufd $0, %xmm4, %xmm0 + movaps %xmm0, (%rax) + leaq 4(%rbx), %rax + cmpq %rax, %r15 + jb .L385 + .p2align 4,,10 + .p2align 3 +.L386: + movdqa -64(%rbp), %xmm6 + movq %rax, %rbx + movups %xmm6, -16(%r12,%rax,4) + leaq 4(%rax), %rax + cmpq %r15, %rax + jbe .L386 + leaq 0(,%rbx,4), %r9 +.L385: + movq %r15, %rcx + subq %rbx, %rcx + movd %ecx, %xmm4 + pshufd $0, %xmm4, %xmm0 + pcmpgtd %xmm3, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L387 + movdqa -64(%rbp), %xmm3 + movd %xmm3, (%r12,%r9) +.L387: + pshufd $85, %xmm0, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L388 + pshufd $85, -64(%rbp), %xmm1 + movd %xmm1, 4(%r12,%r9) +.L388: + movdqa %xmm0, %xmm1 + punpckhdq %xmm0, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L389 + movdqa -64(%rbp), %xmm3 + movdqa %xmm3, %xmm1 + punpckhdq %xmm3, %xmm1 + movd %xmm1, 8(%r12,%r9) +.L389: + pshufd $255, %xmm0, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L390 + pshufd $255, -64(%rbp), %xmm0 + movd %xmm0, 12(%r12,%r9) + jmp .L390 + .cfi_endproc +.LFE18795: + .size _ZN3hwy7N_SSSE36detail22MaybePartitionTwoValueINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, .-_ZN3hwy7N_SSSE36detail22MaybePartitionTwoValueINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + .section .text._ZN3hwy7N_SSSE36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy7N_SSSE36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0, @function +_ZN3hwy7N_SSSE36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0: +.LFB18796: + .cfi_startproc + cmpq %rdx, %rsi + jbe .L461 + leaq (%rdx,%rdx), %rcx + leaq 1(%rdx), %r10 + leaq 1(%rcx), %rax + addq $2, %rcx + cmpq %rax, %rsi + jbe .L461 + movl (%rdi,%rdx,4), %r11d + movd %r11d, %xmm6 + pshufd $0, %xmm6, %xmm0 + jmp .L464 + .p2align 4,,10 + .p2align 3 +.L465: + cmpq %rcx, %rsi + jbe .L461 + movq %rdx, %rax +.L470: + movd (%rdi,%r10,8), %xmm5 + pshufd $0, %xmm5, %xmm1 + pcmpgtd %xmm3, %xmm1 + movmskps %xmm1, %r8d + andl $1, %r8d + jne .L467 +.L466: + cmpq %rdx, %rax + je .L461 + leaq (%rdi,%rax,4), %rdx + movl (%rdx), %ecx + movl %ecx, (%r9) + movl %r11d, (%rdx) + cmpq %rax, %rsi + jbe .L474 + movq %rax, %rdx +.L468: + leaq (%rdx,%rdx), %rcx + leaq 1(%rdx), %r10 + leaq 1(%rcx), %rax + addq $2, %rcx + cmpq %rsi, %rax + jnb .L461 +.L464: + movd (%rdi,%rax,4), %xmm4 + leaq (%rdi,%rdx,4), %r9 + movdqa %xmm0, %xmm3 + pshufd $0, %xmm4, %xmm1 + movdqa %xmm1, %xmm2 + pcmpgtd %xmm0, %xmm2 + movmskps %xmm2, %r8d + andl $1, %r8d + je .L465 + cmpq %rcx, %rsi + jbe .L466 + movdqa %xmm1, %xmm3 + jmp .L470 + .p2align 4,,10 + .p2align 3 +.L467: + cmpq %rcx, %rdx + je .L475 + leaq (%rdi,%rcx,4), %rax + movl (%rax), %edx + movl %edx, (%r9) + movq %rcx, %rdx + movl %r11d, (%rax) + jmp .L468 + .p2align 4,,10 + .p2align 3 +.L461: + ret + .p2align 4,,10 + .p2align 3 +.L474: + ret +.L475: + ret + .cfi_endproc +.LFE18796: + .size _ZN3hwy7N_SSSE36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0, .-_ZN3hwy7N_SSSE36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0 + .section 
.text._ZN3hwy7N_SSSE36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy7N_SSSE36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0, @function +_ZN3hwy7N_SSSE36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0: +.LFB18797: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsi, %rax + salq $2, %rax + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + .cfi_offset 15, -24 + leaq (%rdi,%rax), %r15 + pushq %r14 + .cfi_offset 14, -32 + leaq (%r15,%rax), %r14 + pushq %r13 + .cfi_offset 13, -40 + leaq (%r14,%rax), %r13 + pushq %r12 + .cfi_offset 12, -48 + leaq 0(%r13,%rax), %r12 + pushq %rbx + .cfi_offset 3, -56 + leaq (%r12,%rax), %rbx + leaq (%rbx,%rax), %r11 + leaq (%r11,%rax), %r10 + subq $240, %rsp + leaq (%r10,%rax), %r9 + leaq (%r9,%rax), %r8 + movq %rdi, -264(%rbp) + movq %rsi, -240(%rbp) + movdqu (%r15), %xmm6 + movdqu (%rdi), %xmm12 + leaq (%r8,%rax), %rdi + leaq (%rdi,%rax), %rsi + movdqu 0(%r13), %xmm5 + movdqu (%r14), %xmm14 + movdqa %xmm6, %xmm8 + leaq (%rsi,%rax), %rcx + movdqu (%rbx), %xmm3 + movdqu (%r12), %xmm11 + pcmpgtd %xmm12, %xmm8 + leaq (%rcx,%rax), %rdx + movdqu (%r10), %xmm2 + movdqu (%r11), %xmm10 + movdqu (%rdx), %xmm0 + movdqu (%rsi), %xmm4 + movq %rdx, -248(%rbp) + addq %rax, %rdx + movdqu (%rdx), %xmm15 + movdqu (%r8), %xmm1 + addq %rdx, %rax + movq %rdx, -256(%rbp) + movdqa %xmm8, %xmm13 + movdqu (%r9), %xmm7 + movdqu (%rdi), %xmm9 + pandn %xmm6, %xmm13 + movaps %xmm15, -112(%rbp) + pand %xmm8, %xmm6 + movdqa %xmm13, %xmm15 + movdqa %xmm12, %xmm13 + pand %xmm8, %xmm13 + por %xmm15, %xmm13 + movdqa %xmm8, %xmm15 + pandn %xmm12, %xmm15 + movdqa %xmm5, %xmm12 + pcmpgtd %xmm14, %xmm12 + por %xmm15, %xmm6 + movdqa %xmm12, %xmm8 + pandn %xmm5, %xmm8 + pand %xmm12, %xmm5 + movdqa %xmm8, %xmm15 + movdqa %xmm14, %xmm8 + pand %xmm12, %xmm8 + por %xmm15, %xmm8 + movdqa %xmm12, %xmm15 + pandn %xmm14, %xmm15 + movdqa %xmm3, %xmm14 + pcmpgtd %xmm11, %xmm14 + por %xmm15, %xmm5 + movdqa %xmm14, %xmm12 + pandn %xmm3, %xmm12 + pand %xmm14, %xmm3 + movdqa %xmm12, %xmm15 + movdqa %xmm11, %xmm12 + pand %xmm14, %xmm12 + por %xmm15, %xmm12 + movdqa %xmm14, %xmm15 + pandn %xmm11, %xmm15 + movdqa %xmm2, %xmm11 + pcmpgtd %xmm10, %xmm11 + por %xmm15, %xmm3 + movaps %xmm3, -64(%rbp) + movdqa %xmm10, %xmm3 + movdqa %xmm11, %xmm14 + pand %xmm11, %xmm3 + pandn %xmm2, %xmm14 + pand %xmm11, %xmm2 + por %xmm14, %xmm3 + movdqa %xmm11, %xmm14 + pandn %xmm10, %xmm14 + movdqa %xmm1, %xmm10 + pcmpgtd %xmm7, %xmm10 + por %xmm14, %xmm2 + movdqa %xmm10, %xmm11 + pandn %xmm1, %xmm11 + pand %xmm10, %xmm1 + movdqa %xmm11, %xmm14 + movdqa %xmm7, %xmm11 + pand %xmm10, %xmm11 + por %xmm14, %xmm11 + movdqa %xmm10, %xmm14 + pandn %xmm7, %xmm14 + movdqa %xmm4, %xmm7 + pcmpgtd %xmm9, %xmm7 + por %xmm14, %xmm1 + movaps %xmm1, -80(%rbp) + movdqa %xmm7, %xmm10 + movdqa %xmm7, %xmm1 + movdqa %xmm9, %xmm7 + pandn %xmm4, %xmm10 + pand %xmm1, %xmm7 + pand %xmm1, %xmm4 + por %xmm10, %xmm7 + movdqa %xmm1, %xmm10 + movdqa %xmm0, %xmm1 + pandn %xmm9, %xmm10 + por %xmm10, %xmm4 + movdqu (%rcx), %xmm10 + pcmpgtd %xmm10, %xmm1 + movdqu (%rcx), %xmm10 + movdqu (%rcx), %xmm15 + movdqa %xmm1, %xmm9 + pand %xmm1, %xmm10 + pandn %xmm0, %xmm9 + pand %xmm1, %xmm0 + por %xmm9, %xmm10 + movdqa %xmm1, %xmm9 + movdqu (%rax), %xmm1 + pandn %xmm15, %xmm9 + movdqa -112(%rbp), %xmm15 + por %xmm9, 
%xmm0 + pcmpgtd %xmm15, %xmm1 + movaps %xmm0, -96(%rbp) + movdqu (%rax), %xmm0 + movdqa %xmm1, %xmm9 + pandn %xmm0, %xmm9 + movdqa %xmm15, %xmm0 + pand %xmm1, %xmm0 + por %xmm9, %xmm0 + movdqa %xmm1, %xmm9 + pandn %xmm15, %xmm9 + movdqu (%rax), %xmm15 + pand %xmm15, %xmm1 + movdqa %xmm13, %xmm15 + por %xmm9, %xmm1 + movdqa %xmm8, %xmm9 + pcmpgtd %xmm13, %xmm9 + movdqa %xmm9, %xmm14 + pandn %xmm8, %xmm9 + pand %xmm14, %xmm15 + pand %xmm14, %xmm8 + por %xmm15, %xmm9 + movdqa %xmm14, %xmm15 + movdqa %xmm6, %xmm14 + pandn %xmm13, %xmm15 + movdqa %xmm5, %xmm13 + pcmpgtd %xmm6, %xmm13 + por %xmm8, %xmm15 + movdqa %xmm13, %xmm8 + pand %xmm13, %xmm14 + pandn %xmm5, %xmm8 + pand %xmm13, %xmm5 + por %xmm14, %xmm8 + movdqa %xmm13, %xmm14 + pandn %xmm6, %xmm14 + movdqa %xmm3, %xmm6 + pcmpgtd %xmm12, %xmm6 + por %xmm14, %xmm5 + movdqa -64(%rbp), %xmm14 + movaps %xmm5, -112(%rbp) + movdqa %xmm12, %xmm5 + movdqa %xmm6, %xmm13 + pand %xmm6, %xmm5 + pandn %xmm3, %xmm13 + pand %xmm6, %xmm3 + por %xmm13, %xmm5 + movdqa %xmm6, %xmm13 + movdqa %xmm14, %xmm6 + pandn %xmm12, %xmm13 + movdqa %xmm2, %xmm12 + pcmpgtd %xmm14, %xmm12 + por %xmm13, %xmm3 + movdqa %xmm12, %xmm13 + pand %xmm12, %xmm6 + pandn %xmm2, %xmm13 + pand %xmm12, %xmm2 + por %xmm13, %xmm6 + movdqa %xmm12, %xmm13 + pandn %xmm14, %xmm13 + movdqa %xmm11, %xmm14 + por %xmm13, %xmm2 + movdqa %xmm7, %xmm13 + pcmpgtd %xmm11, %xmm13 + movdqa %xmm13, %xmm12 + pand %xmm13, %xmm14 + pandn %xmm7, %xmm12 + pand %xmm13, %xmm7 + por %xmm14, %xmm12 + movdqa %xmm13, %xmm14 + pandn %xmm11, %xmm14 + movdqa %xmm4, %xmm11 + por %xmm14, %xmm7 + movdqa -80(%rbp), %xmm14 + movaps %xmm7, -128(%rbp) + pcmpgtd %xmm14, %xmm11 + movdqa %xmm14, %xmm13 + movdqa %xmm11, %xmm7 + pand %xmm11, %xmm13 + pandn %xmm4, %xmm7 + pand %xmm11, %xmm4 + por %xmm13, %xmm7 + movdqa %xmm11, %xmm13 + movdqa %xmm0, %xmm11 + pcmpgtd %xmm10, %xmm11 + pandn %xmm14, %xmm13 + movdqa -96(%rbp), %xmm14 + por %xmm13, %xmm4 + movaps %xmm4, -80(%rbp) + movdqa %xmm10, %xmm4 + movdqa %xmm11, %xmm13 + pand %xmm11, %xmm4 + pandn %xmm0, %xmm13 + pand %xmm11, %xmm0 + por %xmm13, %xmm4 + movdqa %xmm11, %xmm13 + movdqa %xmm1, %xmm11 + pcmpgtd %xmm14, %xmm11 + pandn %xmm10, %xmm13 + movdqa %xmm14, %xmm10 + por %xmm13, %xmm0 + movdqa %xmm11, %xmm13 + pand %xmm11, %xmm10 + pandn %xmm1, %xmm13 + pand %xmm11, %xmm1 + por %xmm13, %xmm10 + movdqa %xmm11, %xmm13 + pandn %xmm14, %xmm13 + movdqa %xmm9, %xmm14 + por %xmm13, %xmm1 + movdqa %xmm5, %xmm13 + pcmpgtd %xmm9, %xmm13 + movdqa %xmm13, %xmm11 + pand %xmm13, %xmm14 + pandn %xmm5, %xmm11 + pand %xmm13, %xmm5 + por %xmm14, %xmm11 + movdqa %xmm13, %xmm14 + movdqa %xmm8, %xmm13 + pandn %xmm9, %xmm14 + movdqa %xmm6, %xmm9 + pcmpgtd %xmm8, %xmm9 + por %xmm5, %xmm14 + movdqa %xmm9, %xmm5 + pand %xmm9, %xmm13 + pandn %xmm6, %xmm5 + pand %xmm9, %xmm6 + por %xmm13, %xmm5 + movdqa %xmm9, %xmm13 + movdqa %xmm15, %xmm9 + pandn %xmm8, %xmm13 + movdqa %xmm3, %xmm8 + pcmpgtd %xmm15, %xmm8 + por %xmm13, %xmm6 + movaps %xmm6, -96(%rbp) + movdqa %xmm8, %xmm6 + pand %xmm8, %xmm9 + pandn %xmm3, %xmm6 + pand %xmm8, %xmm3 + por %xmm9, %xmm6 + movdqa %xmm8, %xmm9 + movdqa %xmm2, %xmm8 + pandn %xmm15, %xmm9 + movdqa -112(%rbp), %xmm15 + por %xmm9, %xmm3 + pcmpgtd %xmm15, %xmm8 + movaps %xmm3, -144(%rbp) + movdqa %xmm15, %xmm9 + movdqa %xmm8, %xmm3 + pand %xmm8, %xmm9 + pandn %xmm2, %xmm3 + pand %xmm8, %xmm2 + por %xmm9, %xmm3 + movdqa %xmm8, %xmm9 + movdqa %xmm12, %xmm8 + pandn %xmm15, %xmm9 + movdqa -128(%rbp), %xmm15 + por %xmm9, %xmm2 + movaps %xmm2, -64(%rbp) + movdqa %xmm4, %xmm2 + 
pcmpgtd %xmm12, %xmm2 + movdqa %xmm2, %xmm9 + pand %xmm2, %xmm8 + pandn %xmm4, %xmm9 + pand %xmm2, %xmm4 + por %xmm9, %xmm8 + movdqa %xmm2, %xmm9 + movdqa %xmm10, %xmm2 + pcmpgtd %xmm7, %xmm2 + pandn %xmm12, %xmm9 + por %xmm9, %xmm4 + movdqa %xmm7, %xmm9 + movdqa %xmm2, %xmm12 + pand %xmm2, %xmm9 + pandn %xmm10, %xmm12 + pand %xmm2, %xmm10 + por %xmm12, %xmm9 + movdqa %xmm2, %xmm12 + movdqa %xmm15, %xmm2 + pandn %xmm7, %xmm12 + movdqa %xmm0, %xmm7 + pcmpgtd %xmm15, %xmm7 + por %xmm12, %xmm10 + movdqa %xmm7, %xmm12 + pand %xmm7, %xmm2 + pandn %xmm0, %xmm12 + pand %xmm7, %xmm0 + por %xmm12, %xmm2 + movdqa %xmm7, %xmm12 + pandn %xmm15, %xmm12 + movdqa -80(%rbp), %xmm15 + por %xmm12, %xmm0 + movdqa %xmm1, %xmm12 + pcmpgtd %xmm15, %xmm12 + movdqa %xmm12, %xmm7 + pandn %xmm1, %xmm7 + pand %xmm12, %xmm1 + movdqa %xmm7, %xmm13 + movdqa %xmm15, %xmm7 + pand %xmm12, %xmm7 + por %xmm13, %xmm7 + movdqa %xmm12, %xmm13 + movdqa %xmm8, %xmm12 + pcmpgtd %xmm11, %xmm12 + pandn %xmm15, %xmm13 + por %xmm13, %xmm1 + movdqa %xmm12, %xmm15 + pandn %xmm8, %xmm15 + pand %xmm12, %xmm8 + movdqa %xmm15, %xmm13 + movdqa %xmm11, %xmm15 + pand %xmm12, %xmm15 + por %xmm13, %xmm15 + movaps %xmm15, -112(%rbp) + movdqa %xmm12, %xmm15 + movdqa %xmm5, %xmm12 + pandn %xmm11, %xmm15 + movdqa %xmm15, %xmm13 + movdqa %xmm8, %xmm15 + movdqa %xmm9, %xmm8 + pcmpgtd %xmm5, %xmm8 + por %xmm13, %xmm15 + movdqa -96(%rbp), %xmm13 + movdqa %xmm8, %xmm11 + pand %xmm8, %xmm12 + pandn %xmm9, %xmm11 + pand %xmm8, %xmm9 + por %xmm11, %xmm12 + movdqa %xmm8, %xmm11 + movdqa %xmm2, %xmm8 + pcmpgtd %xmm6, %xmm8 + pandn %xmm5, %xmm11 + movdqa %xmm6, %xmm5 + movaps %xmm12, -128(%rbp) + por %xmm11, %xmm9 + movdqa %xmm8, %xmm11 + pand %xmm8, %xmm5 + pandn %xmm2, %xmm11 + pand %xmm8, %xmm2 + por %xmm11, %xmm5 + movdqa %xmm8, %xmm11 + movdqa %xmm7, %xmm8 + pcmpgtd %xmm3, %xmm8 + pandn %xmm6, %xmm11 + movdqa %xmm3, %xmm6 + movaps %xmm5, -80(%rbp) + por %xmm11, %xmm2 + movdqa %xmm8, %xmm11 + pand %xmm8, %xmm6 + pandn %xmm7, %xmm11 + pand %xmm8, %xmm7 + por %xmm11, %xmm6 + movdqa %xmm8, %xmm11 + movdqa %xmm4, %xmm8 + pcmpgtd %xmm14, %xmm8 + pandn %xmm3, %xmm11 + movdqa %xmm14, %xmm3 + por %xmm11, %xmm7 + movdqa %xmm8, %xmm11 + pand %xmm8, %xmm3 + pandn %xmm4, %xmm11 + pand %xmm8, %xmm4 + por %xmm11, %xmm3 + movdqa %xmm8, %xmm11 + movdqa %xmm13, %xmm8 + pandn %xmm14, %xmm11 + movdqa -144(%rbp), %xmm14 + por %xmm11, %xmm4 + movdqa %xmm10, %xmm11 + pcmpgtd %xmm13, %xmm11 + movdqa %xmm11, %xmm12 + pand %xmm11, %xmm8 + pandn %xmm10, %xmm12 + pand %xmm11, %xmm10 + por %xmm12, %xmm8 + movdqa %xmm11, %xmm12 + movdqa %xmm0, %xmm11 + pcmpgtd %xmm14, %xmm11 + pandn %xmm13, %xmm12 + por %xmm12, %xmm10 + movdqa %xmm14, %xmm12 + movdqa %xmm11, %xmm13 + pand %xmm11, %xmm12 + pandn %xmm0, %xmm13 + pand %xmm11, %xmm0 + por %xmm13, %xmm12 + movdqa %xmm11, %xmm13 + pandn %xmm14, %xmm13 + movdqa -64(%rbp), %xmm14 + por %xmm13, %xmm0 + movdqa %xmm1, %xmm13 + pcmpgtd %xmm14, %xmm13 + movdqa %xmm13, %xmm11 + movdqa %xmm13, %xmm5 + pandn -64(%rbp), %xmm5 + pandn %xmm1, %xmm11 + pand %xmm13, %xmm1 + pand %xmm13, %xmm14 + por %xmm5, %xmm1 + por %xmm14, %xmm11 + movdqa %xmm8, %xmm13 + movaps %xmm1, -192(%rbp) + movdqa %xmm2, %xmm1 + pcmpgtd %xmm8, %xmm1 + movdqa %xmm1, %xmm14 + pand %xmm1, %xmm13 + pandn %xmm2, %xmm14 + pand %xmm1, %xmm2 + por %xmm14, %xmm13 + movdqa %xmm1, %xmm14 + movdqa %xmm12, %xmm1 + pandn %xmm8, %xmm14 + movdqa %xmm9, %xmm8 + pcmpgtd %xmm12, %xmm8 + por %xmm14, %xmm2 + movdqa %xmm8, %xmm14 + pand %xmm8, %xmm1 + pandn %xmm9, %xmm14 + pand %xmm8, %xmm9 + por 
%xmm14, %xmm1 + movdqa %xmm8, %xmm14 + movdqa %xmm4, %xmm8 + pcmpgtd %xmm6, %xmm8 + pandn %xmm12, %xmm14 + por %xmm14, %xmm9 + movdqa %xmm6, %xmm14 + movdqa %xmm8, %xmm5 + pand %xmm8, %xmm14 + pandn %xmm4, %xmm5 + pand %xmm8, %xmm4 + por %xmm5, %xmm14 + movdqa %xmm8, %xmm5 + movdqa %xmm11, %xmm8 + pandn %xmm6, %xmm5 + movdqa %xmm7, %xmm6 + pcmpgtd %xmm11, %xmm6 + por %xmm5, %xmm4 + movdqa %xmm6, %xmm5 + pand %xmm6, %xmm8 + pandn %xmm7, %xmm5 + pand %xmm6, %xmm7 + por %xmm5, %xmm8 + movdqa %xmm6, %xmm5 + movdqa %xmm10, %xmm6 + pandn %xmm11, %xmm5 + movdqa %xmm0, %xmm11 + pcmpgtd %xmm10, %xmm11 + por %xmm5, %xmm7 + movdqa %xmm11, %xmm5 + pand %xmm11, %xmm6 + pandn %xmm0, %xmm5 + pand %xmm11, %xmm0 + por %xmm5, %xmm6 + movdqa %xmm11, %xmm5 + movdqa %xmm15, %xmm11 + pcmpgtd %xmm3, %xmm11 + pandn %xmm10, %xmm5 + movdqa %xmm3, %xmm10 + por %xmm5, %xmm0 + movaps %xmm0, -64(%rbp) + movdqa -128(%rbp), %xmm0 + movdqa %xmm11, %xmm5 + pand %xmm11, %xmm10 + pandn %xmm15, %xmm5 + pand %xmm11, %xmm15 + por %xmm5, %xmm10 + movdqa %xmm11, %xmm5 + pandn %xmm3, %xmm5 + movdqa %xmm0, %xmm3 + por %xmm5, %xmm15 + movdqa -80(%rbp), %xmm5 + movdqa %xmm5, %xmm11 + pcmpgtd %xmm0, %xmm11 + movdqa %xmm11, %xmm12 + pand %xmm11, %xmm3 + pandn %xmm5, %xmm12 + movdqa %xmm11, %xmm5 + pandn %xmm0, %xmm5 + por %xmm12, %xmm3 + movdqa %xmm5, %xmm12 + movdqa -80(%rbp), %xmm5 + movdqa %xmm3, %xmm0 + pand %xmm11, %xmm5 + movdqa %xmm10, %xmm11 + pcmpgtd %xmm3, %xmm11 + por %xmm12, %xmm5 + movdqa %xmm11, %xmm12 + pand %xmm11, %xmm0 + pandn %xmm10, %xmm12 + pand %xmm11, %xmm10 + por %xmm0, %xmm12 + movdqa -64(%rbp), %xmm0 + movaps %xmm12, -128(%rbp) + movdqa %xmm11, %xmm12 + movdqa %xmm6, %xmm11 + pcmpgtd %xmm8, %xmm11 + pandn %xmm3, %xmm12 + movdqa %xmm8, %xmm3 + por %xmm12, %xmm10 + movdqa %xmm11, %xmm12 + pand %xmm11, %xmm3 + pandn %xmm6, %xmm12 + pand %xmm11, %xmm6 + por %xmm12, %xmm3 + movdqa %xmm11, %xmm12 + movdqa %xmm15, %xmm11 + pcmpgtd %xmm5, %xmm11 + pandn %xmm8, %xmm12 + movdqa %xmm5, %xmm8 + por %xmm12, %xmm6 + movdqa %xmm11, %xmm12 + pand %xmm11, %xmm8 + pandn %xmm15, %xmm12 + pand %xmm11, %xmm15 + por %xmm12, %xmm8 + movdqa %xmm11, %xmm12 + movdqa %xmm7, %xmm11 + pandn %xmm5, %xmm12 + movdqa %xmm0, %xmm5 + pcmpgtd %xmm7, %xmm5 + por %xmm12, %xmm15 + movdqa %xmm5, %xmm12 + pand %xmm5, %xmm11 + pandn %xmm0, %xmm12 + pand %xmm5, %xmm0 + por %xmm12, %xmm11 + movdqa %xmm5, %xmm12 + pandn %xmm7, %xmm12 + movdqa %xmm8, %xmm7 + por %xmm12, %xmm0 + movdqa %xmm1, %xmm12 + movaps %xmm0, -208(%rbp) + movdqa %xmm10, %xmm0 + pcmpgtd %xmm13, %xmm12 + cmpq $1, -240(%rbp) + pcmpgtd %xmm8, %xmm0 + movdqa %xmm0, %xmm5 + pand %xmm0, %xmm7 + pandn %xmm10, %xmm5 + pand %xmm0, %xmm10 + por %xmm5, %xmm7 + movdqa %xmm0, %xmm5 + movdqa %xmm12, %xmm0 + pandn %xmm8, %xmm5 + pandn %xmm1, %xmm0 + pand %xmm12, %xmm1 + movaps %xmm7, -144(%rbp) + por %xmm5, %xmm10 + movdqa %xmm13, %xmm5 + movdqa %xmm9, %xmm7 + pand %xmm12, %xmm5 + movdqa %xmm11, %xmm8 + por %xmm0, %xmm5 + movdqa %xmm12, %xmm0 + pandn %xmm13, %xmm0 + movdqa %xmm6, %xmm13 + por %xmm0, %xmm1 + pcmpgtd %xmm11, %xmm13 + movdqa %xmm2, %xmm0 + pcmpgtd %xmm9, %xmm0 + movdqa %xmm1, %xmm12 + pand %xmm13, %xmm8 + movdqa %xmm0, %xmm1 + pand %xmm0, %xmm7 + pandn %xmm2, %xmm1 + pand %xmm0, %xmm2 + por %xmm1, %xmm7 + movdqa %xmm0, %xmm1 + movdqa %xmm13, %xmm0 + pandn %xmm9, %xmm1 + pandn %xmm6, %xmm0 + movdqa %xmm14, %xmm9 + por %xmm1, %xmm2 + movdqa %xmm15, %xmm1 + por %xmm0, %xmm8 + pcmpgtd %xmm14, %xmm1 + movdqa %xmm13, %xmm0 + pand %xmm6, %xmm13 + pandn %xmm11, %xmm0 + movdqa %xmm3, %xmm6 + 
movdqa %xmm7, %xmm11 + por %xmm0, %xmm13 + movdqa %xmm1, %xmm0 + pand %xmm1, %xmm9 + pandn %xmm15, %xmm0 + pand %xmm1, %xmm15 + por %xmm0, %xmm9 + movdqa %xmm1, %xmm0 + pandn %xmm14, %xmm0 + movdqa %xmm9, %xmm14 + por %xmm0, %xmm15 + movdqa %xmm4, %xmm0 + pcmpgtd %xmm3, %xmm0 + movdqa %xmm0, %xmm1 + pand %xmm0, %xmm6 + pandn %xmm4, %xmm1 + pand %xmm0, %xmm4 + por %xmm1, %xmm6 + movdqa %xmm0, %xmm1 + movdqa %xmm5, %xmm0 + pcmpgtd %xmm9, %xmm0 + pcmpgtd %xmm6, %xmm11 + pandn %xmm3, %xmm1 + por %xmm1, %xmm4 + movdqa %xmm12, %xmm3 + movdqa %xmm0, %xmm1 + pand %xmm0, %xmm14 + pandn %xmm5, %xmm1 + pand %xmm0, %xmm5 + por %xmm1, %xmm14 + movdqa %xmm0, %xmm1 + pandn %xmm9, %xmm1 + movdqa %xmm6, %xmm9 + por %xmm1, %xmm5 + movdqa %xmm15, %xmm1 + pand %xmm11, %xmm9 + pcmpgtd %xmm12, %xmm1 + movdqa %xmm1, %xmm0 + pand %xmm1, %xmm3 + pandn %xmm15, %xmm0 + por %xmm0, %xmm3 + movdqa %xmm1, %xmm0 + pand %xmm15, %xmm1 + pandn %xmm12, %xmm0 + por %xmm0, %xmm1 + movdqa %xmm11, %xmm0 + pandn %xmm7, %xmm0 + pand %xmm11, %xmm7 + por %xmm0, %xmm9 + movdqa %xmm11, %xmm0 + pandn %xmm6, %xmm0 + por %xmm0, %xmm7 + movdqa %xmm4, %xmm0 + pcmpgtd %xmm2, %xmm0 + movdqa %xmm7, %xmm11 + movdqa %xmm2, %xmm7 + movdqa %xmm0, %xmm6 + movdqa %xmm0, %xmm12 + pand %xmm0, %xmm7 + pandn %xmm2, %xmm6 + movdqa %xmm10, %xmm2 + pand %xmm4, %xmm0 + pcmpgtd %xmm14, %xmm2 + por %xmm6, %xmm0 + pandn %xmm4, %xmm12 + movdqa %xmm14, %xmm6 + por %xmm12, %xmm7 + movdqa %xmm2, %xmm4 + pand %xmm2, %xmm6 + pandn %xmm10, %xmm4 + pand %xmm2, %xmm10 + por %xmm4, %xmm6 + movdqa %xmm2, %xmm4 + movdqa %xmm3, %xmm2 + pcmpgtd %xmm5, %xmm2 + pandn %xmm14, %xmm4 + movaps %xmm6, -160(%rbp) + movdqa %xmm5, %xmm6 + por %xmm4, %xmm10 + movaps %xmm10, -64(%rbp) + movdqa %xmm8, %xmm10 + movdqa %xmm2, %xmm4 + pand %xmm2, %xmm6 + pandn %xmm3, %xmm4 + pand %xmm2, %xmm3 + por %xmm4, %xmm6 + movdqa %xmm2, %xmm4 + movdqa %xmm1, %xmm2 + pcmpgtd %xmm9, %xmm2 + pandn %xmm5, %xmm4 + movaps %xmm6, -80(%rbp) + movdqa %xmm9, %xmm5 + por %xmm4, %xmm3 + movdqa %xmm7, %xmm6 + pcmpgtd %xmm11, %xmm6 + movdqa %xmm2, %xmm4 + pand %xmm2, %xmm5 + pandn %xmm1, %xmm4 + pand %xmm2, %xmm1 + por %xmm4, %xmm5 + movdqa %xmm2, %xmm4 + movdqa %xmm11, %xmm2 + pandn %xmm9, %xmm4 + pand %xmm6, %xmm2 + por %xmm4, %xmm1 + movdqa %xmm6, %xmm4 + pandn %xmm7, %xmm4 + pand %xmm6, %xmm7 + por %xmm4, %xmm2 + movdqa %xmm6, %xmm4 + pandn %xmm11, %xmm4 + movdqa %xmm2, %xmm9 + por %xmm4, %xmm7 + pcmpgtd %xmm1, %xmm9 + movdqa %xmm0, %xmm4 + pcmpgtd %xmm8, %xmm4 + movdqa %xmm4, %xmm6 + pand %xmm4, %xmm10 + pandn %xmm0, %xmm6 + pand %xmm4, %xmm0 + por %xmm6, %xmm10 + movdqa %xmm4, %xmm6 + movdqa %xmm5, %xmm4 + pcmpgtd %xmm3, %xmm4 + pandn %xmm8, %xmm6 + movdqa %xmm3, %xmm8 + por %xmm6, %xmm0 + movdqa %xmm4, %xmm6 + pand %xmm4, %xmm8 + pandn %xmm5, %xmm6 + pand %xmm4, %xmm5 + por %xmm6, %xmm8 + movdqa %xmm4, %xmm6 + movdqa %xmm9, %xmm4 + pandn %xmm3, %xmm6 + movdqa %xmm1, %xmm3 + pandn %xmm2, %xmm4 + movaps %xmm8, -96(%rbp) + pand %xmm9, %xmm3 + por %xmm6, %xmm5 + pand %xmm9, %xmm2 + por %xmm4, %xmm3 + movdqa %xmm9, %xmm4 + movdqa %xmm5, %xmm12 + pandn %xmm1, %xmm4 + por %xmm4, %xmm2 + jbe .L481 + movdqa -112(%rbp), %xmm5 + pshufd $177, %xmm13, %xmm14 + pshufd $177, -192(%rbp), %xmm13 + movdqa %xmm13, %xmm8 + pshufd $177, %xmm3, %xmm6 + pshufd $177, %xmm0, %xmm0 + pshufd $177, %xmm7, %xmm7 + pshufd $177, -208(%rbp), %xmm15 + pcmpgtd %xmm5, %xmm8 + movaps %xmm6, -176(%rbp) + movdqa %xmm5, %xmm4 + movdqa %xmm15, %xmm9 + movdqa -160(%rbp), %xmm3 + pshufd $177, %xmm10, %xmm10 + pshufd $177, %xmm2, %xmm2 + movdqa 
%xmm8, %xmm6 + movdqa %xmm8, %xmm1 + pand %xmm8, %xmm4 + pandn %xmm5, %xmm6 + movdqa -128(%rbp), %xmm5 + pandn %xmm13, %xmm1 + pand %xmm13, %xmm8 + por %xmm1, %xmm4 + movaps %xmm6, -208(%rbp) + movdqa -144(%rbp), %xmm6 + pcmpgtd %xmm5, %xmm9 + movdqa %xmm5, %xmm11 + movaps %xmm4, -192(%rbp) + movdqa %xmm6, %xmm4 + movdqa %xmm9, %xmm1 + pand %xmm9, %xmm11 + pandn %xmm15, %xmm1 + por %xmm1, %xmm11 + movdqa %xmm9, %xmm1 + pand %xmm15, %xmm9 + pandn %xmm5, %xmm1 + movdqa %xmm14, %xmm5 + pcmpgtd %xmm6, %xmm5 + movaps %xmm1, -224(%rbp) + movdqa %xmm5, %xmm1 + pand %xmm5, %xmm4 + pandn %xmm14, %xmm1 + por %xmm1, %xmm4 + movdqa %xmm5, %xmm1 + pand %xmm14, %xmm5 + pandn %xmm6, %xmm1 + movaps %xmm4, -112(%rbp) + movdqa %xmm3, %xmm6 + movaps %xmm1, -288(%rbp) + movdqa %xmm0, %xmm1 + pcmpgtd %xmm3, %xmm1 + movdqa %xmm1, %xmm4 + pand %xmm1, %xmm6 + pandn %xmm0, %xmm4 + por %xmm4, %xmm6 + movdqa %xmm1, %xmm4 + pand %xmm0, %xmm1 + pandn %xmm3, %xmm4 + movaps %xmm6, -128(%rbp) + movdqa -64(%rbp), %xmm6 + movaps %xmm4, -304(%rbp) + movdqa %xmm10, %xmm4 + por -304(%rbp), %xmm1 + pcmpgtd %xmm6, %xmm4 + pshufd $177, %xmm1, %xmm1 + movdqa %xmm4, %xmm3 + pandn %xmm10, %xmm3 + pand %xmm4, %xmm10 + movaps %xmm3, -320(%rbp) + movdqa %xmm4, %xmm3 + pand -64(%rbp), %xmm4 + por -320(%rbp), %xmm4 + pandn %xmm6, %xmm3 + por %xmm3, %xmm10 + movdqa %xmm7, %xmm3 + pshufd $177, %xmm4, %xmm4 + movaps %xmm10, -144(%rbp) + movdqa -80(%rbp), %xmm10 + pcmpgtd %xmm10, %xmm3 + movdqa %xmm3, %xmm6 + pandn %xmm7, %xmm3 + movaps %xmm3, -336(%rbp) + movdqa %xmm6, %xmm3 + pand %xmm6, %xmm7 + pand -80(%rbp), %xmm6 + pandn %xmm10, %xmm3 + por %xmm3, %xmm7 + movdqa %xmm2, %xmm3 + movaps %xmm7, -160(%rbp) + movdqa -96(%rbp), %xmm7 + pcmpgtd %xmm7, %xmm3 + movdqa %xmm3, %xmm10 + pandn %xmm2, %xmm3 + movaps %xmm3, -352(%rbp) + movdqa %xmm10, %xmm3 + pand %xmm10, %xmm2 + pandn %xmm7, %xmm3 + movdqa -176(%rbp), %xmm7 + por %xmm3, %xmm2 + pcmpgtd %xmm12, %xmm7 + movdqa %xmm7, %xmm3 + pandn -176(%rbp), %xmm3 + movaps %xmm3, -368(%rbp) + movdqa %xmm7, %xmm3 + pandn %xmm12, %xmm3 + movaps %xmm3, -384(%rbp) + movdqa -176(%rbp), %xmm3 + pand %xmm7, %xmm3 + pand %xmm12, %xmm7 + por -384(%rbp), %xmm3 + por -336(%rbp), %xmm6 + por -368(%rbp), %xmm7 + movdqa -192(%rbp), %xmm13 + pand -96(%rbp), %xmm10 + pshufd $177, %xmm7, %xmm7 + pshufd $177, %xmm6, %xmm6 + por -352(%rbp), %xmm10 + movdqa %xmm7, %xmm0 + por -288(%rbp), %xmm5 + por -224(%rbp), %xmm9 + pcmpgtd %xmm13, %xmm0 + pshufd $177, %xmm10, %xmm14 + por -208(%rbp), %xmm8 + pshufd $177, %xmm9, %xmm15 + movdqa %xmm13, %xmm9 + movaps %xmm14, -64(%rbp) + pshufd $177, %xmm5, %xmm5 + pshufd $177, %xmm8, %xmm12 + movdqa %xmm3, %xmm8 + movdqa %xmm0, %xmm10 + pandn %xmm7, %xmm0 + pand %xmm10, %xmm9 + por %xmm0, %xmm9 + movdqa %xmm10, %xmm0 + pand %xmm7, %xmm10 + pandn %xmm13, %xmm0 + movdqa %xmm12, %xmm13 + pcmpgtd %xmm3, %xmm13 + movaps %xmm0, -80(%rbp) + movdqa %xmm13, %xmm0 + pand %xmm13, %xmm8 + pandn %xmm12, %xmm0 + por %xmm0, %xmm8 + movdqa %xmm13, %xmm0 + pand %xmm12, %xmm13 + pandn %xmm3, %xmm0 + movaps %xmm8, -224(%rbp) + movdqa %xmm11, %xmm8 + movaps %xmm0, -304(%rbp) + movdqa %xmm14, %xmm0 + pcmpgtd %xmm11, %xmm0 + movdqa %xmm0, %xmm3 + pandn -64(%rbp), %xmm3 + pand %xmm0, %xmm8 + por %xmm3, %xmm8 + movdqa %xmm0, %xmm3 + pand -64(%rbp), %xmm0 + pandn %xmm11, %xmm3 + movdqa %xmm2, %xmm11 + movaps %xmm8, -96(%rbp) + movaps %xmm3, -320(%rbp) + movdqa %xmm15, %xmm3 + por -320(%rbp), %xmm0 + pcmpgtd %xmm2, %xmm3 + pshufd $177, %xmm0, %xmm0 + movdqa %xmm3, %xmm14 + pand %xmm3, %xmm11 + pandn %xmm15, 
%xmm14 + por %xmm14, %xmm11 + movdqa %xmm3, %xmm14 + pand %xmm15, %xmm3 + movaps %xmm11, -176(%rbp) + movdqa -112(%rbp), %xmm11 + pandn %xmm2, %xmm14 + movdqa %xmm6, %xmm2 + movaps %xmm14, -336(%rbp) + pcmpgtd %xmm11, %xmm2 + movdqa %xmm2, %xmm14 + movdqa %xmm2, %xmm8 + pandn %xmm6, %xmm14 + pand %xmm2, %xmm6 + pandn %xmm11, %xmm8 + movaps %xmm14, -352(%rbp) + movdqa %xmm6, %xmm14 + movdqa %xmm5, %xmm6 + pand -112(%rbp), %xmm2 + por %xmm8, %xmm14 + por -352(%rbp), %xmm2 + movdqa -160(%rbp), %xmm8 + movaps %xmm14, -192(%rbp) + pcmpgtd %xmm8, %xmm6 + pshufd $177, %xmm2, %xmm2 + movdqa %xmm6, %xmm14 + pandn %xmm5, %xmm6 + movaps %xmm6, -368(%rbp) + movdqa %xmm14, %xmm6 + pand %xmm14, %xmm5 + pandn %xmm8, %xmm6 + por %xmm6, %xmm5 + movdqa -128(%rbp), %xmm6 + movaps %xmm5, -208(%rbp) + movdqa %xmm4, %xmm5 + pcmpgtd %xmm6, %xmm5 + movdqa %xmm5, %xmm8 + pandn %xmm4, %xmm5 + movdqa %xmm5, %xmm11 + movdqa %xmm8, %xmm5 + pand %xmm8, %xmm4 + pandn %xmm6, %xmm5 + pand -128(%rbp), %xmm8 + por %xmm5, %xmm4 + movdqa %xmm1, %xmm5 + pcmpgtd -144(%rbp), %xmm5 + movaps %xmm4, -288(%rbp) + por %xmm11, %xmm8 + pshufd $177, %xmm8, %xmm8 + movdqa %xmm5, %xmm6 + movdqa %xmm5, %xmm4 + pandn -144(%rbp), %xmm4 + pandn %xmm1, %xmm6 + por -80(%rbp), %xmm10 + pand %xmm5, %xmm1 + movdqa -288(%rbp), %xmm15 + pand -144(%rbp), %xmm5 + por %xmm4, %xmm1 + por -304(%rbp), %xmm13 + pshufd $177, %xmm10, %xmm11 + movdqa %xmm15, %xmm4 + por -336(%rbp), %xmm3 + por %xmm6, %xmm5 + pshufd $177, %xmm13, %xmm10 + movdqa -96(%rbp), %xmm6 + movaps %xmm11, -128(%rbp) + pshufd $177, %xmm5, %xmm7 + movaps %xmm10, -80(%rbp) + movdqa %xmm9, %xmm10 + movdqa -192(%rbp), %xmm5 + movaps %xmm7, -64(%rbp) + movdqa %xmm8, %xmm7 + pshufd $177, %xmm3, %xmm3 + pand -160(%rbp), %xmm14 + por -368(%rbp), %xmm14 + pcmpgtd %xmm9, %xmm7 + pshufd $177, %xmm14, %xmm14 + movdqa %xmm7, %xmm13 + pand %xmm7, %xmm10 + pandn %xmm8, %xmm13 + por %xmm13, %xmm10 + movdqa %xmm7, %xmm13 + pand %xmm8, %xmm7 + pandn %xmm9, %xmm13 + movdqa %xmm2, %xmm9 + movdqa %xmm10, %xmm8 + pcmpgtd %xmm6, %xmm9 + movaps %xmm13, -304(%rbp) + movdqa %xmm9, %xmm13 + pandn %xmm2, %xmm9 + movdqa %xmm13, %xmm12 + pand %xmm13, %xmm2 + movaps %xmm9, -320(%rbp) + pandn %xmm6, %xmm12 + por %xmm12, %xmm2 + movdqa %xmm11, %xmm12 + pcmpgtd %xmm15, %xmm12 + movdqa %xmm12, %xmm6 + pandn -128(%rbp), %xmm12 + pand %xmm6, %xmm4 + movdqa %xmm4, %xmm9 + por %xmm12, %xmm9 + movdqa %xmm6, %xmm12 + pandn %xmm15, %xmm12 + movdqa -224(%rbp), %xmm15 + movaps %xmm12, -288(%rbp) + movdqa %xmm0, %xmm12 + pcmpgtd %xmm5, %xmm12 + movdqa %xmm12, %xmm4 + pandn %xmm0, %xmm4 + pand %xmm12, %xmm0 + movaps %xmm4, -336(%rbp) + movdqa %xmm12, %xmm4 + pandn %xmm5, %xmm4 + por %xmm4, %xmm0 + movdqa -64(%rbp), %xmm4 + movaps %xmm0, -144(%rbp) + movdqa %xmm4, %xmm11 + pcmpgtd %xmm15, %xmm11 + movdqa %xmm11, %xmm5 + pandn -64(%rbp), %xmm11 + movdqa %xmm11, %xmm4 + movdqa %xmm15, %xmm11 + pand %xmm5, %xmm11 + por %xmm4, %xmm11 + movdqa %xmm5, %xmm4 + pandn %xmm15, %xmm4 + movaps %xmm11, -160(%rbp) + movdqa -176(%rbp), %xmm15 + movdqa %xmm14, %xmm11 + movaps %xmm4, -352(%rbp) + pcmpgtd %xmm15, %xmm11 + movdqa %xmm11, %xmm4 + pandn %xmm14, %xmm4 + pand %xmm11, %xmm14 + movaps %xmm4, -368(%rbp) + movdqa %xmm11, %xmm4 + pandn %xmm15, %xmm4 + movdqa -80(%rbp), %xmm15 + por %xmm4, %xmm14 + movdqa %xmm15, %xmm4 + movaps %xmm14, -112(%rbp) + movdqa %xmm1, %xmm15 + pcmpgtd %xmm1, %xmm4 + movdqa %xmm4, %xmm14 + pandn -80(%rbp), %xmm14 + pand %xmm4, %xmm15 + por %xmm14, %xmm15 + movdqa %xmm4, %xmm14 + pandn %xmm1, %xmm14 + movdqa -208(%rbp), 
%xmm1 + movaps %xmm14, -384(%rbp) + movdqa %xmm3, %xmm14 + pcmpgtd %xmm1, %xmm14 + movdqa %xmm14, %xmm0 + pandn %xmm3, %xmm0 + pand %xmm14, %xmm3 + movaps %xmm0, -400(%rbp) + movdqa %xmm14, %xmm0 + pandn %xmm1, %xmm0 + por %xmm0, %xmm3 + movaps %xmm3, -224(%rbp) + movdqa -96(%rbp), %xmm3 + movdqa -208(%rbp), %xmm1 + pand -80(%rbp), %xmm4 + pand -64(%rbp), %xmm5 + pand %xmm13, %xmm3 + por -384(%rbp), %xmm4 + por -320(%rbp), %xmm3 + movdqa -400(%rbp), %xmm13 + pand %xmm14, %xmm1 + por -304(%rbp), %xmm7 + pshufd $177, %xmm3, %xmm3 + pand -128(%rbp), %xmm6 + pand -192(%rbp), %xmm12 + por %xmm1, %xmm13 + pshufd $177, %xmm4, %xmm1 + pshufd $177, %xmm7, %xmm7 + movdqa -144(%rbp), %xmm4 + movaps %xmm1, -64(%rbp) + movdqa %xmm3, %xmm1 + por -336(%rbp), %xmm12 + por -288(%rbp), %xmm6 + pcmpgtd %xmm10, %xmm1 + por -352(%rbp), %xmm5 + pand -176(%rbp), %xmm11 + pshufd $177, %xmm12, %xmm12 + pshufd $177, %xmm6, %xmm6 + por -368(%rbp), %xmm11 + pshufd $177, %xmm5, %xmm5 + pshufd $177, %xmm13, %xmm13 + movdqa %xmm1, %xmm0 + pand %xmm1, %xmm8 + pshufd $177, %xmm11, %xmm11 + pandn %xmm3, %xmm0 + por %xmm0, %xmm8 + movdqa %xmm1, %xmm0 + pand %xmm3, %xmm1 + pandn %xmm10, %xmm0 + movdqa %xmm7, %xmm10 + pcmpgtd %xmm2, %xmm10 + por %xmm0, %xmm1 + movdqa %xmm2, %xmm0 + movdqa %xmm10, %xmm3 + pand %xmm10, %xmm0 + pandn %xmm7, %xmm3 + por %xmm0, %xmm3 + movdqa %xmm10, %xmm0 + pand %xmm7, %xmm10 + pandn %xmm2, %xmm0 + movdqa %xmm12, %xmm2 + pcmpgtd %xmm9, %xmm2 + movdqa %xmm0, %xmm14 + movdqa %xmm4, %xmm0 + por %xmm10, %xmm14 + movdqa %xmm9, %xmm10 + movdqa %xmm2, %xmm7 + pand %xmm2, %xmm10 + pandn %xmm12, %xmm7 + por %xmm7, %xmm10 + movdqa %xmm2, %xmm7 + pand %xmm12, %xmm2 + pandn %xmm9, %xmm7 + por %xmm7, %xmm2 + movdqa %xmm2, %xmm12 + movdqa %xmm6, %xmm2 + pcmpgtd %xmm4, %xmm2 + pand %xmm2, %xmm0 + movdqa %xmm2, %xmm7 + pandn %xmm6, %xmm7 + movdqa %xmm0, %xmm9 + movdqa %xmm11, %xmm0 + por %xmm7, %xmm9 + movdqa %xmm2, %xmm7 + pand %xmm6, %xmm2 + pandn %xmm4, %xmm7 + movdqa -160(%rbp), %xmm4 + por %xmm2, %xmm7 + pcmpgtd %xmm4, %xmm0 + movdqa %xmm4, %xmm6 + movdqa %xmm0, %xmm2 + pand %xmm0, %xmm6 + pandn %xmm11, %xmm2 + por %xmm2, %xmm6 + movdqa %xmm0, %xmm2 + pand %xmm11, %xmm0 + pandn %xmm4, %xmm2 + movaps %xmm6, -288(%rbp) + movdqa -112(%rbp), %xmm6 + por %xmm2, %xmm0 + movdqa %xmm0, %xmm11 + movdqa %xmm5, %xmm0 + pcmpgtd %xmm6, %xmm0 + movdqa %xmm0, %xmm2 + pand %xmm0, %xmm6 + movdqa %xmm0, %xmm4 + pandn %xmm5, %xmm2 + pand %xmm5, %xmm0 + movdqa %xmm13, %xmm5 + pcmpgtd %xmm15, %xmm5 + pandn -112(%rbp), %xmm4 + por %xmm2, %xmm6 + por %xmm4, %xmm0 + movaps %xmm0, -304(%rbp) + movdqa %xmm5, %xmm2 + movdqa %xmm5, %xmm0 + movdqa %xmm15, %xmm5 + pandn %xmm13, %xmm2 + pand %xmm0, %xmm5 + por %xmm2, %xmm5 + movdqa %xmm0, %xmm2 + pand %xmm13, %xmm0 + movaps %xmm5, -320(%rbp) + movdqa -64(%rbp), %xmm5 + pandn %xmm15, %xmm2 + movdqa -224(%rbp), %xmm15 + movdqa %xmm0, %xmm13 + movdqa %xmm5, %xmm4 + por %xmm2, %xmm13 + pcmpgtd %xmm15, %xmm4 + movdqa %xmm4, %xmm2 + movdqa %xmm4, %xmm0 + movdqa %xmm5, %xmm4 + pandn %xmm5, %xmm2 + movdqa %xmm15, %xmm5 + pand %xmm0, %xmm5 + por %xmm2, %xmm5 + movdqa %xmm0, %xmm2 + pand %xmm4, %xmm0 + pandn %xmm15, %xmm2 + movdqa %xmm0, %xmm15 + pshufd $177, %xmm8, %xmm0 + movaps %xmm5, -336(%rbp) + por %xmm2, %xmm15 + movdqa %xmm0, %xmm2 + pcmpgtd %xmm8, %xmm2 + movdqa %xmm2, %xmm4 + movdqa %xmm2, %xmm5 + pandn %xmm0, %xmm4 + pandn %xmm8, %xmm5 + pand %xmm2, %xmm0 + pand %xmm2, %xmm8 + por %xmm5, %xmm0 + por %xmm4, %xmm8 + pshufd $221, %xmm0, %xmm0 + pshufd $136, %xmm8, %xmm8 + punpckldq %xmm0, 
%xmm8 + pshufd $177, %xmm1, %xmm0 + movdqa %xmm0, %xmm2 + movaps %xmm8, -352(%rbp) + pcmpgtd %xmm1, %xmm2 + movaps %xmm8, -112(%rbp) + movdqa %xmm2, %xmm4 + movdqa %xmm2, %xmm8 + pandn %xmm0, %xmm4 + pandn %xmm1, %xmm8 + pand %xmm2, %xmm0 + pand %xmm2, %xmm1 + por %xmm8, %xmm0 + por %xmm4, %xmm1 + pshufd $221, %xmm0, %xmm0 + pshufd $136, %xmm1, %xmm1 + punpckldq %xmm0, %xmm1 + pshufd $177, %xmm3, %xmm0 + movaps %xmm1, -368(%rbp) + movaps %xmm1, -128(%rbp) + movdqa %xmm0, %xmm1 + pcmpgtd %xmm3, %xmm1 + movdqa %xmm1, %xmm2 + movdqa %xmm1, %xmm4 + pandn %xmm0, %xmm2 + pandn %xmm3, %xmm4 + pand %xmm1, %xmm0 + pand %xmm1, %xmm3 + por %xmm4, %xmm0 + por %xmm2, %xmm3 + pshufd $221, %xmm0, %xmm0 + pshufd $136, %xmm3, %xmm3 + punpckldq %xmm0, %xmm3 + pshufd $177, %xmm14, %xmm0 + movdqa %xmm0, %xmm1 + movaps %xmm3, -384(%rbp) + pcmpgtd %xmm14, %xmm1 + movaps %xmm3, -144(%rbp) + movdqa %xmm1, %xmm2 + movdqa %xmm1, %xmm3 + pandn %xmm0, %xmm2 + pandn %xmm14, %xmm3 + pand %xmm1, %xmm0 + pand %xmm1, %xmm14 + por %xmm3, %xmm0 + por %xmm2, %xmm14 + pshufd $221, %xmm0, %xmm0 + pshufd $136, %xmm14, %xmm14 + punpckldq %xmm0, %xmm14 + pshufd $177, %xmm10, %xmm0 + movdqa %xmm0, %xmm1 + movaps %xmm14, -160(%rbp) + cmpq $3, -240(%rbp) + pcmpgtd %xmm10, %xmm1 + movdqa %xmm1, %xmm2 + movdqa %xmm1, %xmm3 + pandn %xmm0, %xmm2 + pandn %xmm10, %xmm3 + pand %xmm1, %xmm0 + pand %xmm1, %xmm10 + por %xmm3, %xmm0 + por %xmm2, %xmm10 + pshufd $221, %xmm0, %xmm0 + pshufd $136, %xmm10, %xmm10 + punpckldq %xmm0, %xmm10 + pshufd $177, %xmm12, %xmm0 + movdqa %xmm0, %xmm1 + movaps %xmm10, -176(%rbp) + pcmpgtd %xmm12, %xmm1 + movaps %xmm10, -64(%rbp) + movdqa %xmm1, %xmm2 + movdqa %xmm1, %xmm3 + pandn %xmm0, %xmm2 + pandn %xmm12, %xmm3 + pand %xmm1, %xmm0 + pand %xmm1, %xmm12 + por %xmm3, %xmm0 + por %xmm2, %xmm12 + pshufd $221, %xmm0, %xmm0 + pshufd $136, %xmm12, %xmm12 + punpckldq %xmm0, %xmm12 + pshufd $177, %xmm9, %xmm0 + movdqa %xmm0, %xmm1 + movaps %xmm12, -192(%rbp) + pcmpgtd %xmm9, %xmm1 + movaps %xmm12, -80(%rbp) + movdqa %xmm1, %xmm2 + movdqa %xmm1, %xmm3 + pandn %xmm0, %xmm2 + pandn %xmm9, %xmm3 + pand %xmm1, %xmm0 + pand %xmm1, %xmm9 + por %xmm3, %xmm0 + por %xmm2, %xmm9 + pshufd $221, %xmm0, %xmm0 + pshufd $136, %xmm9, %xmm9 + punpckldq %xmm0, %xmm9 + pshufd $177, %xmm7, %xmm0 + movdqa %xmm0, %xmm1 + movaps %xmm9, -208(%rbp) + pcmpgtd %xmm7, %xmm1 + movaps %xmm9, -96(%rbp) + movdqa %xmm1, %xmm2 + movdqa %xmm1, %xmm3 + pandn %xmm0, %xmm2 + pandn %xmm7, %xmm3 + pand %xmm1, %xmm7 + por %xmm2, %xmm7 + pand %xmm1, %xmm0 + pshufd $136, %xmm7, %xmm7 + por %xmm3, %xmm0 + movdqa %xmm7, %xmm3 + movdqa -288(%rbp), %xmm7 + pshufd $221, %xmm0, %xmm0 + punpckldq %xmm0, %xmm3 + pshufd $177, %xmm7, %xmm0 + movaps %xmm3, -224(%rbp) + movdqa %xmm3, %xmm12 + movdqa %xmm0, %xmm1 + pcmpgtd %xmm7, %xmm1 + movdqa %xmm1, %xmm2 + movdqa %xmm1, %xmm3 + pandn %xmm0, %xmm2 + pandn %xmm7, %xmm3 + pand %xmm1, %xmm0 + pand %xmm7, %xmm1 + por %xmm3, %xmm0 + por %xmm2, %xmm1 + pshufd $221, %xmm0, %xmm0 + pshufd $136, %xmm1, %xmm1 + punpckldq %xmm0, %xmm1 + pshufd $177, %xmm11, %xmm0 + movdqa %xmm1, %xmm8 + movdqa %xmm0, %xmm1 + pcmpgtd %xmm11, %xmm1 + movdqa %xmm1, %xmm2 + movdqa %xmm1, %xmm3 + pandn %xmm0, %xmm2 + pandn %xmm11, %xmm3 + pand %xmm1, %xmm11 + pand %xmm1, %xmm0 + por %xmm2, %xmm11 + por %xmm3, %xmm0 + pshufd $136, %xmm11, %xmm11 + pshufd $221, %xmm0, %xmm0 + movdqa %xmm11, %xmm7 + punpckldq %xmm0, %xmm7 + pshufd $177, %xmm6, %xmm0 + movdqa %xmm0, %xmm1 + pcmpgtd %xmm6, %xmm1 + movdqa %xmm1, %xmm2 + movdqa %xmm1, %xmm3 + pandn %xmm0, 
%xmm2 + pandn %xmm6, %xmm3 + pand %xmm1, %xmm6 + por %xmm2, %xmm6 + pand %xmm1, %xmm0 + pshufd $136, %xmm6, %xmm6 + por %xmm3, %xmm0 + movdqa %xmm6, %xmm10 + movdqa -304(%rbp), %xmm6 + pshufd $221, %xmm0, %xmm0 + punpckldq %xmm0, %xmm10 + pshufd $177, %xmm6, %xmm0 + movdqa %xmm0, %xmm1 + pcmpgtd %xmm6, %xmm1 + movdqa %xmm1, %xmm2 + movdqa %xmm1, %xmm3 + pandn %xmm0, %xmm2 + pandn %xmm6, %xmm3 + pand %xmm1, %xmm0 + pand %xmm6, %xmm1 + por %xmm3, %xmm0 + movdqa -320(%rbp), %xmm6 + por %xmm2, %xmm1 + pshufd $221, %xmm0, %xmm0 + pshufd $136, %xmm1, %xmm1 + punpckldq %xmm0, %xmm1 + movdqa %xmm1, %xmm5 + pshufd $177, %xmm6, %xmm1 + movdqa %xmm1, %xmm0 + pcmpgtd %xmm6, %xmm0 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm3 + pandn %xmm1, %xmm2 + pandn %xmm6, %xmm3 + pand %xmm0, %xmm1 + pand %xmm6, %xmm0 + por %xmm3, %xmm1 + movdqa -336(%rbp), %xmm6 + por %xmm2, %xmm0 + pshufd $221, %xmm1, %xmm1 + pshufd $136, %xmm0, %xmm0 + punpckldq %xmm1, %xmm0 + pshufd $177, %xmm13, %xmm1 + movdqa %xmm1, %xmm2 + pcmpgtd %xmm13, %xmm2 + movdqa %xmm2, %xmm3 + movdqa %xmm2, %xmm4 + pandn %xmm1, %xmm3 + pandn %xmm13, %xmm4 + pand %xmm2, %xmm1 + pand %xmm2, %xmm13 + por %xmm4, %xmm1 + pshufd $177, %xmm6, %xmm2 + por %xmm3, %xmm13 + pshufd $221, %xmm1, %xmm1 + pshufd $136, %xmm13, %xmm13 + punpckldq %xmm1, %xmm13 + movdqa %xmm2, %xmm1 + pcmpgtd %xmm6, %xmm1 + movdqa %xmm1, %xmm3 + movdqa %xmm1, %xmm4 + pandn %xmm2, %xmm3 + pandn %xmm6, %xmm4 + pand %xmm1, %xmm2 + pand %xmm6, %xmm1 + por %xmm4, %xmm2 + por %xmm3, %xmm1 + pshufd $221, %xmm2, %xmm2 + pshufd $177, %xmm15, %xmm3 + pshufd $136, %xmm1, %xmm1 + punpckldq %xmm2, %xmm1 + movdqa %xmm3, %xmm2 + pcmpgtd %xmm15, %xmm2 + movdqa %xmm2, %xmm4 + movdqa %xmm2, %xmm6 + pandn %xmm3, %xmm4 + pandn %xmm15, %xmm6 + pand %xmm2, %xmm3 + pand %xmm2, %xmm15 + por %xmm6, %xmm3 + por %xmm4, %xmm15 + pshufd $221, %xmm3, %xmm3 + pshufd $136, %xmm15, %xmm2 + punpckldq %xmm3, %xmm2 + jbe .L482 + pshufd $27, %xmm2, %xmm2 + pshufd $27, %xmm13, %xmm3 + pshufd $27, %xmm1, %xmm4 + movdqa -352(%rbp), %xmm15 + movdqa %xmm2, %xmm9 + pshufd $27, %xmm10, %xmm6 + movdqa %xmm4, %xmm10 + movaps %xmm4, -144(%rbp) + pcmpgtd %xmm15, %xmm9 + movdqa %xmm15, %xmm13 + movdqa %xmm3, %xmm4 + movaps %xmm3, -64(%rbp) + pshufd $27, %xmm0, %xmm0 + pshufd $27, %xmm5, %xmm5 + pshufd $27, %xmm7, %xmm7 + pshufd $27, %xmm8, %xmm8 + movdqa %xmm9, %xmm1 + pand %xmm9, %xmm13 + pandn %xmm2, %xmm1 + por %xmm1, %xmm13 + movdqa %xmm9, %xmm1 + pand %xmm2, %xmm9 + pandn %xmm15, %xmm1 + movdqa -368(%rbp), %xmm15 + movaps %xmm1, -240(%rbp) + pcmpgtd %xmm15, %xmm10 + movdqa %xmm15, %xmm12 + movdqa %xmm10, %xmm11 + movdqa %xmm10, %xmm1 + pand %xmm10, %xmm12 + pandn %xmm15, %xmm11 + pandn -144(%rbp), %xmm1 + movaps %xmm11, -288(%rbp) + movdqa -384(%rbp), %xmm11 + por %xmm1, %xmm12 + pcmpgtd %xmm11, %xmm4 + movdqa %xmm11, %xmm15 + movdqa %xmm4, %xmm1 + pandn -64(%rbp), %xmm1 + pand %xmm4, %xmm15 + por %xmm1, %xmm15 + movdqa %xmm4, %xmm1 + pandn %xmm11, %xmm1 + movaps %xmm15, -80(%rbp) + movaps %xmm1, -304(%rbp) + movdqa %xmm0, %xmm1 + pcmpgtd %xmm14, %xmm1 + movdqa %xmm1, %xmm15 + movdqa %xmm1, %xmm11 + pandn %xmm0, %xmm15 + pand %xmm14, %xmm11 + por %xmm15, %xmm11 + movdqa %xmm1, %xmm15 + pand %xmm0, %xmm1 + pandn %xmm14, %xmm15 + movdqa %xmm5, %xmm14 + movaps %xmm11, -96(%rbp) + movaps %xmm15, -320(%rbp) + por -320(%rbp), %xmm1 + movdqa -176(%rbp), %xmm15 + pcmpgtd %xmm15, %xmm14 + pshufd $27, %xmm1, %xmm1 + movdqa -192(%rbp), %xmm15 + movdqa %xmm14, %xmm3 + pandn %xmm5, %xmm14 + movdqa %xmm3, %xmm11 + pand %xmm3, %xmm5 + pandn 
-176(%rbp), %xmm11 + movaps %xmm14, -336(%rbp) + pand -176(%rbp), %xmm3 + por -336(%rbp), %xmm3 + por %xmm11, %xmm5 + movaps %xmm5, -112(%rbp) + movdqa %xmm6, %xmm5 + pcmpgtd %xmm15, %xmm5 + movdqa -224(%rbp), %xmm15 + movdqa %xmm5, %xmm11 + pandn %xmm6, %xmm11 + pand %xmm5, %xmm6 + movaps %xmm11, -352(%rbp) + movdqa %xmm5, %xmm11 + pandn -192(%rbp), %xmm11 + pand -192(%rbp), %xmm5 + por -352(%rbp), %xmm5 + por %xmm11, %xmm6 + movdqa %xmm7, %xmm11 + movaps %xmm6, -128(%rbp) + movdqa -208(%rbp), %xmm6 + pshufd $27, %xmm5, %xmm5 + pcmpgtd %xmm6, %xmm11 + movdqa %xmm8, %xmm6 + pcmpgtd %xmm15, %xmm6 + pshufd $27, %xmm3, %xmm15 + movdqa %xmm11, %xmm14 + pandn %xmm7, %xmm14 + pand %xmm11, %xmm7 + movaps %xmm14, -368(%rbp) + movdqa %xmm11, %xmm14 + pandn -208(%rbp), %xmm14 + por %xmm14, %xmm7 + movdqa %xmm6, %xmm14 + pandn %xmm8, %xmm14 + movaps %xmm7, -160(%rbp) + pand %xmm6, %xmm8 + movdqa %xmm6, %xmm7 + pandn -224(%rbp), %xmm7 + pand -64(%rbp), %xmm4 + pand -224(%rbp), %xmm6 + pand -144(%rbp), %xmm10 + por -288(%rbp), %xmm10 + por %xmm7, %xmm8 + por -240(%rbp), %xmm9 + por %xmm14, %xmm6 + por -304(%rbp), %xmm4 + pand -208(%rbp), %xmm11 + pshufd $27, %xmm6, %xmm6 + pshufd $27, %xmm10, %xmm3 + pshufd $27, %xmm9, %xmm2 + movdqa -160(%rbp), %xmm14 + movdqa %xmm6, %xmm10 + movdqa %xmm13, %xmm9 + movaps %xmm2, -208(%rbp) + por -368(%rbp), %xmm11 + pcmpgtd %xmm13, %xmm10 + movaps %xmm3, -64(%rbp) + pshufd $27, %xmm4, %xmm4 + pshufd $27, %xmm11, %xmm11 + movdqa %xmm10, %xmm7 + movdqa %xmm10, %xmm0 + pand %xmm10, %xmm9 + pandn %xmm13, %xmm7 + movdqa %xmm2, %xmm13 + pandn %xmm6, %xmm0 + pcmpgtd %xmm8, %xmm13 + por %xmm0, %xmm9 + movdqa %xmm8, %xmm2 + movaps %xmm7, -288(%rbp) + movdqa %xmm14, %xmm7 + movaps %xmm9, -224(%rbp) + pand %xmm6, %xmm10 + movdqa %xmm13, %xmm0 + pandn -208(%rbp), %xmm0 + pand %xmm13, %xmm2 + por %xmm0, %xmm2 + movdqa %xmm13, %xmm0 + pandn %xmm8, %xmm0 + movaps %xmm2, -240(%rbp) + movaps %xmm0, -304(%rbp) + movdqa %xmm11, %xmm0 + pcmpgtd %xmm12, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm8 + pandn %xmm11, %xmm3 + pandn %xmm12, %xmm8 + movdqa %xmm3, %xmm2 + movdqa %xmm12, %xmm3 + movdqa -64(%rbp), %xmm12 + movaps %xmm8, -320(%rbp) + pand %xmm0, %xmm3 + movdqa -128(%rbp), %xmm8 + pand %xmm11, %xmm0 + por %xmm2, %xmm3 + movaps %xmm3, -144(%rbp) + movdqa %xmm12, %xmm3 + pcmpgtd %xmm14, %xmm3 + pand %xmm3, %xmm7 + movdqa %xmm3, %xmm2 + pandn -64(%rbp), %xmm2 + movdqa %xmm7, %xmm12 + movdqa %xmm3, %xmm7 + pandn %xmm14, %xmm7 + movdqa -80(%rbp), %xmm14 + por %xmm2, %xmm12 + movdqa %xmm5, %xmm2 + movaps %xmm12, -160(%rbp) + pcmpgtd %xmm14, %xmm2 + movaps %xmm7, -336(%rbp) + movdqa %xmm2, %xmm12 + movdqa %xmm2, %xmm7 + pandn %xmm5, %xmm12 + pandn %xmm14, %xmm7 + pand %xmm2, %xmm5 + por %xmm7, %xmm5 + movdqa %xmm4, %xmm7 + pand -80(%rbp), %xmm2 + pcmpgtd %xmm8, %xmm7 + movaps %xmm5, -176(%rbp) + por %xmm12, %xmm2 + pshufd $27, %xmm2, %xmm2 + movdqa %xmm7, %xmm5 + pandn %xmm4, %xmm5 + pand %xmm7, %xmm4 + movaps %xmm5, -352(%rbp) + movdqa %xmm7, %xmm5 + movdqa %xmm4, %xmm14 + movdqa %xmm15, %xmm4 + pandn %xmm8, %xmm5 + por %xmm5, %xmm14 + movdqa -96(%rbp), %xmm5 + movaps %xmm14, -192(%rbp) + pcmpgtd %xmm5, %xmm4 + movdqa %xmm4, %xmm8 + pandn %xmm15, %xmm4 + movdqa %xmm8, %xmm14 + pand %xmm8, %xmm15 + pand -96(%rbp), %xmm8 + pandn %xmm5, %xmm14 + movdqa %xmm1, %xmm5 + por %xmm14, %xmm15 + movdqa -112(%rbp), %xmm14 + por %xmm4, %xmm8 + pshufd $27, %xmm8, %xmm8 + pcmpgtd %xmm14, %xmm5 + movdqa %xmm5, %xmm9 + pandn %xmm1, %xmm9 + pand %xmm5, %xmm1 + movaps %xmm9, -368(%rbp) + por -320(%rbp), 
%xmm0 + movdqa %xmm5, %xmm9 + movdqa -128(%rbp), %xmm4 + pand -112(%rbp), %xmm5 + pandn %xmm14, %xmm9 + pand -64(%rbp), %xmm3 + pand %xmm7, %xmm4 + por -368(%rbp), %xmm5 + pand -208(%rbp), %xmm13 + por -304(%rbp), %xmm13 + por %xmm9, %xmm1 + movdqa -224(%rbp), %xmm6 + pshufd $27, %xmm0, %xmm0 + pshufd $27, %xmm5, %xmm7 + por -336(%rbp), %xmm3 + por -288(%rbp), %xmm10 + movdqa %xmm7, %xmm14 + pshufd $27, %xmm13, %xmm7 + movdqa %xmm2, %xmm13 + movaps %xmm7, -64(%rbp) + movdqa %xmm8, %xmm7 + pshufd $27, %xmm10, %xmm10 + por -352(%rbp), %xmm4 + pcmpgtd %xmm6, %xmm7 + movdqa %xmm10, %xmm11 + movdqa %xmm6, %xmm10 + movaps %xmm14, -224(%rbp) + movaps %xmm11, -208(%rbp) + pshufd $27, %xmm4, %xmm4 + pshufd $27, %xmm3, %xmm3 + movdqa %xmm7, %xmm9 + movdqa %xmm7, %xmm5 + pand %xmm7, %xmm10 + pandn %xmm6, %xmm9 + pandn %xmm8, %xmm5 + pand %xmm8, %xmm7 + movdqa -144(%rbp), %xmm6 + por %xmm5, %xmm10 + movaps %xmm9, -288(%rbp) + movdqa %xmm15, %xmm9 + pcmpgtd %xmm6, %xmm13 + movdqa %xmm13, %xmm5 + pandn %xmm2, %xmm5 + pand %xmm13, %xmm2 + movaps %xmm5, -304(%rbp) + movdqa %xmm13, %xmm5 + pandn %xmm6, %xmm5 + por %xmm5, %xmm2 + movdqa %xmm11, %xmm5 + movdqa -176(%rbp), %xmm11 + pcmpgtd %xmm15, %xmm5 + movdqa %xmm5, %xmm6 + pandn -208(%rbp), %xmm5 + movdqa %xmm6, %xmm12 + pand %xmm6, %xmm9 + pandn %xmm15, %xmm12 + por %xmm5, %xmm9 + movdqa -240(%rbp), %xmm15 + movaps %xmm12, -320(%rbp) + movdqa %xmm0, %xmm12 + pcmpgtd %xmm11, %xmm12 + movdqa %xmm12, %xmm5 + pandn %xmm0, %xmm5 + pand %xmm12, %xmm0 + movaps %xmm5, -336(%rbp) + movdqa %xmm12, %xmm5 + pandn %xmm11, %xmm5 + por %xmm5, %xmm0 + movdqa %xmm14, %xmm5 + movdqa %xmm15, %xmm14 + pcmpgtd %xmm15, %xmm5 + movaps %xmm0, -80(%rbp) + movdqa %xmm5, %xmm11 + pand %xmm5, %xmm14 + pandn -224(%rbp), %xmm11 + por %xmm11, %xmm14 + movdqa %xmm5, %xmm11 + pandn %xmm15, %xmm11 + movaps %xmm14, -96(%rbp) + movdqa -160(%rbp), %xmm15 + movaps %xmm11, -240(%rbp) + movdqa %xmm4, %xmm11 + pcmpgtd %xmm15, %xmm11 + movdqa %xmm11, %xmm14 + pandn %xmm4, %xmm14 + pand %xmm11, %xmm4 + movaps %xmm14, -352(%rbp) + movdqa %xmm11, %xmm14 + pandn %xmm15, %xmm14 + movdqa -64(%rbp), %xmm15 + por %xmm14, %xmm4 + movaps %xmm4, -112(%rbp) + movdqa %xmm15, %xmm4 + movdqa %xmm1, %xmm15 + pcmpgtd %xmm1, %xmm4 + movdqa %xmm4, %xmm14 + pandn -64(%rbp), %xmm14 + pand %xmm4, %xmm15 + por %xmm14, %xmm15 + movdqa %xmm4, %xmm14 + pandn %xmm1, %xmm14 + movdqa %xmm3, %xmm1 + movaps %xmm14, -368(%rbp) + movdqa -192(%rbp), %xmm14 + por -288(%rbp), %xmm7 + movdqa -160(%rbp), %xmm8 + pand -208(%rbp), %xmm6 + pcmpgtd %xmm14, %xmm1 + pshufd $27, %xmm7, %xmm7 + pand -176(%rbp), %xmm12 + pand %xmm11, %xmm8 + por -320(%rbp), %xmm6 + movdqa -192(%rbp), %xmm11 + por -336(%rbp), %xmm12 + pand -224(%rbp), %xmm5 + movdqa %xmm1, %xmm0 + pand %xmm1, %xmm11 + pshufd $27, %xmm6, %xmm6 + pandn %xmm3, %xmm0 + pand %xmm1, %xmm3 + pshufd $27, %xmm12, %xmm12 + movaps %xmm0, -384(%rbp) + movdqa %xmm1, %xmm0 + pand -64(%rbp), %xmm4 + por -352(%rbp), %xmm8 + pandn %xmm14, %xmm0 + por -240(%rbp), %xmm5 + por -384(%rbp), %xmm11 + por %xmm0, %xmm3 + pshufd $27, %xmm8, %xmm8 + por -368(%rbp), %xmm4 + movaps %xmm3, -128(%rbp) + pshufd $27, %xmm5, %xmm5 + pshufd $27, %xmm11, %xmm11 + movdqa -144(%rbp), %xmm3 + pshufd $27, %xmm4, %xmm4 + pand %xmm13, %xmm3 + por -304(%rbp), %xmm3 + movdqa %xmm10, %xmm13 + pshufd $27, %xmm3, %xmm3 + movdqa %xmm3, %xmm1 + pcmpgtd %xmm10, %xmm1 + movdqa %xmm1, %xmm0 + pand %xmm1, %xmm13 + pandn %xmm3, %xmm0 + pand %xmm1, %xmm3 + por %xmm0, %xmm13 + movdqa %xmm1, %xmm0 + movdqa %xmm7, %xmm1 + pcmpgtd 
%xmm2, %xmm1 + pandn %xmm10, %xmm0 + movdqa %xmm2, %xmm10 + por %xmm0, %xmm3 + movdqa %xmm1, %xmm0 + pand %xmm1, %xmm10 + pandn %xmm7, %xmm0 + por %xmm0, %xmm10 + movdqa %xmm1, %xmm0 + pandn %xmm2, %xmm0 + movdqa %xmm1, %xmm2 + movdqa %xmm12, %xmm1 + pcmpgtd %xmm9, %xmm1 + pand %xmm7, %xmm2 + por %xmm0, %xmm2 + movdqa %xmm1, %xmm7 + movdqa %xmm1, %xmm0 + pandn %xmm12, %xmm7 + pandn %xmm9, %xmm0 + movdqa %xmm7, %xmm14 + movdqa %xmm9, %xmm7 + movdqa %xmm1, %xmm9 + pand %xmm12, %xmm9 + movdqa -80(%rbp), %xmm12 + pand %xmm1, %xmm7 + movdqa %xmm6, %xmm1 + por %xmm0, %xmm9 + por %xmm14, %xmm7 + pcmpgtd %xmm12, %xmm1 + movdqa %xmm1, %xmm0 + pandn %xmm6, %xmm0 + movdqa %xmm0, %xmm14 + movdqa %xmm12, %xmm0 + pand %xmm1, %xmm0 + movdqa %xmm0, %xmm12 + movdqa %xmm1, %xmm0 + pandn -80(%rbp), %xmm0 + por %xmm14, %xmm12 + movdqa %xmm0, %xmm14 + movdqa %xmm1, %xmm0 + movdqa %xmm8, %xmm1 + pand %xmm6, %xmm0 + por %xmm14, %xmm0 + movdqa -96(%rbp), %xmm14 + movaps %xmm0, -64(%rbp) + pcmpgtd %xmm14, %xmm1 + movdqa %xmm1, %xmm6 + pand %xmm1, %xmm14 + movdqa %xmm1, %xmm0 + pandn %xmm8, %xmm6 + pand %xmm8, %xmm1 + movdqa -112(%rbp), %xmm8 + pandn -96(%rbp), %xmm0 + por %xmm6, %xmm14 + movdqa %xmm5, %xmm6 + pcmpgtd %xmm8, %xmm6 + por %xmm0, %xmm1 + movaps %xmm1, -80(%rbp) + movdqa %xmm6, %xmm1 + pandn %xmm5, %xmm6 + pand %xmm1, %xmm8 + movdqa %xmm1, %xmm0 + pand %xmm5, %xmm1 + por %xmm6, %xmm8 + movdqa %xmm11, %xmm6 + pandn -112(%rbp), %xmm0 + pcmpgtd %xmm15, %xmm6 + movdqa %xmm1, %xmm5 + por %xmm0, %xmm5 + movaps %xmm5, -96(%rbp) + movdqa %xmm6, %xmm1 + movdqa %xmm6, %xmm5 + movdqa %xmm15, %xmm6 + pandn %xmm11, %xmm5 + pand %xmm1, %xmm6 + por %xmm5, %xmm6 + movdqa %xmm1, %xmm5 + pand %xmm11, %xmm1 + pandn %xmm15, %xmm5 + movdqa -128(%rbp), %xmm15 + movdqa %xmm1, %xmm11 + movdqa %xmm4, %xmm1 + movaps %xmm6, -112(%rbp) + por %xmm5, %xmm11 + pcmpgtd %xmm15, %xmm1 + movdqa %xmm15, %xmm6 + pand %xmm1, %xmm6 + movdqa %xmm1, %xmm5 + pandn %xmm4, %xmm5 + movdqa %xmm6, %xmm0 + por %xmm5, %xmm0 + movdqa %xmm1, %xmm5 + pand %xmm4, %xmm1 + pshufd $27, %xmm13, %xmm4 + movdqa %xmm0, %xmm15 + pandn -128(%rbp), %xmm5 + movdqa %xmm4, %xmm0 + pcmpgtd %xmm13, %xmm0 + por %xmm5, %xmm1 + movaps %xmm1, -128(%rbp) + movdqa %xmm0, %xmm6 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm5 + pandn %xmm4, %xmm6 + pand %xmm4, %xmm1 + pshufd $27, %xmm3, %xmm4 + pandn %xmm13, %xmm5 + pand %xmm0, %xmm13 + movdqa %xmm4, %xmm0 + pcmpgtd %xmm3, %xmm0 + por %xmm5, %xmm1 + por %xmm6, %xmm13 + shufpd $2, %xmm1, %xmm13 + movdqa %xmm0, %xmm5 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm6 + pandn %xmm3, %xmm5 + pand %xmm4, %xmm1 + pandn %xmm4, %xmm6 + por %xmm5, %xmm1 + pand %xmm0, %xmm3 + por %xmm6, %xmm3 + movapd %xmm1, %xmm6 + movsd %xmm3, %xmm6 + pshufd $27, %xmm10, %xmm3 + movdqa %xmm3, %xmm0 + pcmpgtd %xmm10, %xmm0 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm5 + pandn %xmm10, %xmm0 + pandn %xmm3, %xmm5 + pand %xmm1, %xmm10 + pand %xmm3, %xmm1 + pshufd $27, %xmm2, %xmm3 + por %xmm0, %xmm1 + por %xmm5, %xmm10 + movdqa %xmm3, %xmm0 + shufpd $2, %xmm1, %xmm10 + pcmpgtd %xmm2, %xmm0 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm5 + pandn %xmm2, %xmm0 + pand %xmm1, %xmm2 + pand %xmm3, %xmm1 + pandn %xmm3, %xmm5 + por %xmm0, %xmm1 + por %xmm5, %xmm2 + movdqa -64(%rbp), %xmm0 + movapd %xmm1, %xmm3 + movsd %xmm2, %xmm3 + pshufd $27, %xmm7, %xmm2 + movapd %xmm3, %xmm5 + movdqa %xmm2, %xmm3 + pcmpgtd %xmm7, %xmm3 + movdqa %xmm3, %xmm1 + pandn %xmm2, %xmm3 + movdqa %xmm3, %xmm4 + movdqa %xmm1, %xmm3 + pandn %xmm7, %xmm3 + pand %xmm1, %xmm7 + pand %xmm2, %xmm1 + pshufd $27, 
%xmm9, %xmm2 + por %xmm3, %xmm1 + por %xmm4, %xmm7 + movdqa %xmm2, %xmm3 + shufpd $2, %xmm1, %xmm7 + pcmpgtd %xmm9, %xmm3 + movdqa %xmm3, %xmm1 + pandn %xmm2, %xmm3 + movdqa %xmm3, %xmm4 + movdqa %xmm1, %xmm3 + pandn %xmm9, %xmm3 + pand %xmm1, %xmm9 + pand %xmm2, %xmm1 + pshufd $27, %xmm12, %xmm2 + por %xmm3, %xmm1 + por %xmm4, %xmm9 + movdqa %xmm2, %xmm3 + shufpd $2, %xmm1, %xmm9 + pcmpgtd %xmm12, %xmm3 + movdqa %xmm3, %xmm1 + pandn %xmm2, %xmm3 + movdqa %xmm3, %xmm4 + movdqa %xmm1, %xmm3 + pandn %xmm12, %xmm3 + pand %xmm1, %xmm12 + pand %xmm2, %xmm1 + pshufd $27, %xmm0, %xmm2 + por %xmm3, %xmm1 + por %xmm4, %xmm12 + movdqa %xmm2, %xmm3 + shufpd $2, %xmm1, %xmm12 + pcmpgtd %xmm0, %xmm3 + movdqa %xmm3, %xmm1 + pandn %xmm2, %xmm3 + movdqa %xmm3, %xmm4 + movdqa %xmm1, %xmm3 + pandn %xmm0, %xmm3 + pand %xmm1, %xmm0 + pand %xmm2, %xmm1 + por %xmm3, %xmm1 + por %xmm4, %xmm0 + movapd %xmm1, %xmm3 + pshufd $27, %xmm14, %xmm1 + movsd %xmm0, %xmm3 + movdqa %xmm1, %xmm0 + pcmpgtd %xmm14, %xmm0 + movaps %xmm3, -176(%rbp) + movdqa -80(%rbp), %xmm4 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm2 + pandn %xmm1, %xmm3 + pandn %xmm14, %xmm2 + pand %xmm0, %xmm14 + pand %xmm1, %xmm0 + por %xmm3, %xmm14 + pshufd $27, %xmm4, %xmm1 + por %xmm2, %xmm0 + shufpd $2, %xmm0, %xmm14 + movdqa %xmm1, %xmm0 + pcmpgtd %xmm4, %xmm0 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm3 + pandn %xmm4, %xmm2 + pand %xmm0, %xmm4 + pand %xmm1, %xmm0 + pandn %xmm1, %xmm3 + por %xmm2, %xmm0 + pshufd $27, %xmm8, %xmm1 + movapd %xmm0, %xmm2 + movdqa %xmm1, %xmm0 + por %xmm3, %xmm4 + pcmpgtd %xmm8, %xmm0 + movsd %xmm4, %xmm2 + movdqa -96(%rbp), %xmm4 + movaps %xmm2, -192(%rbp) + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm2 + pandn %xmm1, %xmm3 + pandn %xmm8, %xmm2 + pand %xmm0, %xmm8 + pand %xmm1, %xmm0 + por %xmm3, %xmm8 + pshufd $27, %xmm4, %xmm1 + por %xmm2, %xmm0 + shufpd $2, %xmm0, %xmm8 + movdqa %xmm1, %xmm0 + pcmpgtd %xmm4, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm2 + pandn %xmm1, %xmm3 + pandn %xmm4, %xmm2 + pand %xmm0, %xmm4 + pand %xmm1, %xmm0 + por %xmm3, %xmm4 + por %xmm2, %xmm0 + movsd %xmm4, %xmm0 + movdqa -112(%rbp), %xmm4 + movaps %xmm0, -208(%rbp) + pshufd $27, %xmm4, %xmm1 + movdqa %xmm1, %xmm0 + pcmpgtd %xmm4, %xmm0 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm3 + pandn %xmm4, %xmm2 + pand %xmm0, %xmm4 + pand %xmm1, %xmm0 + pandn %xmm1, %xmm3 + por %xmm2, %xmm0 + pshufd $27, %xmm11, %xmm1 + por %xmm3, %xmm4 + movapd %xmm0, %xmm3 + movdqa %xmm1, %xmm0 + pcmpgtd %xmm11, %xmm0 + movsd %xmm4, %xmm3 + movdqa -128(%rbp), %xmm4 + movaps %xmm3, -224(%rbp) + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm3 + pandn %xmm1, %xmm3 + pandn %xmm11, %xmm2 + pand %xmm0, %xmm11 + pand %xmm1, %xmm0 + por %xmm3, %xmm11 + pshufd $27, %xmm15, %xmm1 + por %xmm2, %xmm0 + shufpd $2, %xmm0, %xmm11 + movdqa %xmm1, %xmm0 + pcmpgtd %xmm15, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm2 + pandn %xmm1, %xmm3 + pandn %xmm15, %xmm2 + pand %xmm0, %xmm15 + pand %xmm1, %xmm0 + por %xmm3, %xmm15 + pshufd $27, %xmm4, %xmm1 + por %xmm2, %xmm0 + shufpd $2, %xmm0, %xmm15 + movdqa %xmm1, %xmm0 + pcmpgtd %xmm4, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm2 + pandn %xmm1, %xmm3 + pandn %xmm4, %xmm2 + pand %xmm0, %xmm4 + pand %xmm1, %xmm0 + por %xmm3, %xmm4 + pshufd $177, %xmm13, %xmm1 + por %xmm2, %xmm0 + shufpd $2, %xmm0, %xmm4 + movdqa %xmm1, %xmm0 + pcmpgtd %xmm13, %xmm0 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm3 + pandn %xmm1, %xmm2 + pandn %xmm13, %xmm3 + pand %xmm0, %xmm1 + pand %xmm13, %xmm0 + por %xmm3, %xmm1 + por %xmm2, %xmm0 + pshufd $221, %xmm1, %xmm1 + 
pshufd $136, %xmm0, %xmm0 + punpckldq %xmm1, %xmm0 + pshufd $177, %xmm6, %xmm1 + movaps %xmm0, -112(%rbp) + movdqa %xmm1, %xmm0 + pcmpgtd %xmm6, %xmm0 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm13 + pandn %xmm1, %xmm2 + pandn %xmm6, %xmm13 + pand %xmm0, %xmm1 + pand %xmm6, %xmm0 + por %xmm13, %xmm1 + movapd -224(%rbp), %xmm6 + por %xmm2, %xmm0 + pshufd $221, %xmm1, %xmm1 + pshufd $136, %xmm0, %xmm0 + punpckldq %xmm1, %xmm0 + pshufd $177, %xmm10, %xmm1 + movaps %xmm0, -128(%rbp) + movdqa %xmm1, %xmm0 + pcmpgtd %xmm10, %xmm0 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm13 + pandn %xmm1, %xmm2 + pandn %xmm10, %xmm13 + pand %xmm0, %xmm1 + pand %xmm10, %xmm0 + por %xmm13, %xmm1 + por %xmm2, %xmm0 + pshufd $221, %xmm1, %xmm1 + pshufd $136, %xmm0, %xmm0 + punpckldq %xmm1, %xmm0 + pshufd $177, %xmm5, %xmm1 + movaps %xmm0, -144(%rbp) + movdqa %xmm1, %xmm0 + pcmpgtd %xmm5, %xmm0 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm10 + pandn %xmm1, %xmm2 + pandn %xmm5, %xmm10 + pand %xmm0, %xmm1 + pand %xmm5, %xmm0 + por %xmm10, %xmm1 + movapd -208(%rbp), %xmm5 + por %xmm2, %xmm0 + pshufd $221, %xmm1, %xmm1 + pshufd $136, %xmm0, %xmm0 + punpckldq %xmm1, %xmm0 + pshufd $177, %xmm7, %xmm1 + movaps %xmm0, -160(%rbp) + movdqa %xmm1, %xmm0 + pcmpgtd %xmm7, %xmm0 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm10 + pandn %xmm1, %xmm2 + pandn %xmm7, %xmm10 + pand %xmm0, %xmm1 + pand %xmm7, %xmm0 + por %xmm10, %xmm1 + movapd -176(%rbp), %xmm7 + por %xmm2, %xmm0 + pshufd $221, %xmm1, %xmm1 + pshufd $136, %xmm0, %xmm0 + punpckldq %xmm1, %xmm0 + pshufd $177, %xmm9, %xmm1 + movaps %xmm0, -64(%rbp) + movdqa %xmm1, %xmm0 + pcmpgtd %xmm9, %xmm0 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm10 + pandn %xmm1, %xmm2 + pandn %xmm9, %xmm10 + pand %xmm0, %xmm1 + pand %xmm9, %xmm0 + por %xmm10, %xmm1 + por %xmm2, %xmm0 + pshufd $221, %xmm1, %xmm1 + pshufd $136, %xmm0, %xmm0 + punpckldq %xmm1, %xmm0 + pshufd $177, %xmm12, %xmm1 + movaps %xmm0, -80(%rbp) + movdqa %xmm1, %xmm0 + pcmpgtd %xmm12, %xmm0 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm10 + pandn %xmm1, %xmm2 + pandn %xmm12, %xmm10 + pand %xmm0, %xmm1 + pand %xmm12, %xmm0 + por %xmm10, %xmm1 + por %xmm2, %xmm0 + pshufd $221, %xmm1, %xmm1 + pshufd $136, %xmm0, %xmm0 + punpckldq %xmm1, %xmm0 + pshufd $177, %xmm7, %xmm1 + movaps %xmm0, -96(%rbp) + movdqa %xmm1, %xmm0 + pcmpgtd %xmm7, %xmm0 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm3 + pandn %xmm1, %xmm2 + pandn %xmm7, %xmm3 + pand %xmm0, %xmm1 + pand %xmm7, %xmm0 + por %xmm3, %xmm1 + movapd -192(%rbp), %xmm7 + por %xmm2, %xmm0 + pshufd $221, %xmm1, %xmm1 + pshufd $136, %xmm0, %xmm0 + punpckldq %xmm1, %xmm0 + movdqa %xmm0, %xmm12 + pshufd $177, %xmm14, %xmm0 + movdqa %xmm0, %xmm3 + pcmpgtd %xmm14, %xmm3 + movdqa %xmm3, %xmm1 + movdqa %xmm3, %xmm2 + pandn %xmm0, %xmm1 + pandn %xmm14, %xmm2 + pand %xmm3, %xmm0 + pand %xmm14, %xmm3 + por %xmm2, %xmm0 + por %xmm1, %xmm3 + pshufd $221, %xmm0, %xmm0 + pshufd $136, %xmm3, %xmm3 + punpckldq %xmm0, %xmm3 + pshufd $177, %xmm7, %xmm0 + movdqa %xmm0, %xmm9 + pcmpgtd %xmm7, %xmm9 + movdqa %xmm9, %xmm1 + movdqa %xmm9, %xmm2 + pandn %xmm0, %xmm1 + pandn %xmm7, %xmm2 + pand %xmm9, %xmm0 + pand %xmm7, %xmm9 + por %xmm2, %xmm0 + por %xmm1, %xmm9 + pshufd $221, %xmm0, %xmm0 + pshufd $136, %xmm9, %xmm9 + punpckldq %xmm0, %xmm9 + pshufd $177, %xmm8, %xmm0 + movdqa %xmm0, %xmm7 + pcmpgtd %xmm8, %xmm7 + movdqa %xmm7, %xmm1 + movdqa %xmm7, %xmm2 + pandn %xmm0, %xmm1 + pandn %xmm8, %xmm2 + pand %xmm7, %xmm0 + pand %xmm8, %xmm7 + por %xmm2, %xmm0 + por %xmm1, %xmm7 + pshufd $221, %xmm0, %xmm0 + pshufd $136, %xmm7, %xmm7 + 
punpckldq %xmm0, %xmm7 + pshufd $177, %xmm5, %xmm0 + movdqa %xmm0, %xmm10 + pcmpgtd %xmm5, %xmm10 + movdqa %xmm10, %xmm1 + movdqa %xmm10, %xmm2 + pandn %xmm0, %xmm1 + pandn %xmm5, %xmm2 + pand %xmm10, %xmm0 + pand %xmm5, %xmm10 + por %xmm2, %xmm0 + por %xmm1, %xmm10 + pshufd $221, %xmm0, %xmm0 + pshufd $177, %xmm6, %xmm1 + pshufd $136, %xmm10, %xmm10 + punpckldq %xmm0, %xmm10 + movdqa %xmm1, %xmm0 + pcmpgtd %xmm6, %xmm0 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm5 + pandn %xmm1, %xmm2 + pandn %xmm6, %xmm5 + pand %xmm0, %xmm1 + pand %xmm6, %xmm0 + por %xmm5, %xmm1 + por %xmm2, %xmm0 + pshufd $221, %xmm1, %xmm1 + pshufd $136, %xmm0, %xmm0 + punpckldq %xmm1, %xmm0 + pshufd $177, %xmm11, %xmm1 + movdqa %xmm1, %xmm13 + pcmpgtd %xmm11, %xmm13 + movdqa %xmm13, %xmm2 + movdqa %xmm13, %xmm5 + pandn %xmm1, %xmm2 + pandn %xmm11, %xmm5 + pand %xmm13, %xmm1 + pand %xmm11, %xmm13 + por %xmm5, %xmm1 + por %xmm2, %xmm13 + pshufd $221, %xmm1, %xmm1 + pshufd $177, %xmm15, %xmm2 + pshufd $136, %xmm13, %xmm13 + punpckldq %xmm1, %xmm13 + movdqa %xmm2, %xmm1 + pcmpgtd %xmm15, %xmm1 + movdqa %xmm1, %xmm5 + movdqa %xmm1, %xmm6 + pandn %xmm2, %xmm5 + pandn %xmm15, %xmm6 + pand %xmm1, %xmm2 + pand %xmm15, %xmm1 + por %xmm6, %xmm2 + por %xmm5, %xmm1 + pshufd $221, %xmm2, %xmm2 + pshufd $177, %xmm4, %xmm5 + pshufd $136, %xmm1, %xmm1 + punpckldq %xmm2, %xmm1 + movdqa %xmm5, %xmm2 + pcmpgtd %xmm4, %xmm2 + movdqa %xmm2, %xmm6 + movdqa %xmm2, %xmm8 + pandn %xmm5, %xmm6 + pandn %xmm4, %xmm8 + pand %xmm2, %xmm5 + pand %xmm4, %xmm2 + por %xmm8, %xmm5 + por %xmm6, %xmm2 + pshufd $221, %xmm5, %xmm5 + pshufd $136, %xmm2, %xmm2 + punpckldq %xmm5, %xmm2 + movdqa %xmm2, %xmm15 +.L478: + movdqa -112(%rbp), %xmm4 + movq -264(%rbp), %rdx + movups %xmm4, (%rdx) + movdqa -128(%rbp), %xmm4 + movups %xmm4, (%r15) + movdqa -144(%rbp), %xmm4 + movups %xmm4, (%r14) + movdqa -160(%rbp), %xmm4 + movups %xmm4, 0(%r13) + movdqa -64(%rbp), %xmm4 + movups %xmm4, (%r12) + movdqa -80(%rbp), %xmm4 + movups %xmm4, (%rbx) + movdqa -96(%rbp), %xmm4 + movq -248(%rbp), %rbx + movups %xmm4, (%r11) + movups %xmm12, (%r10) + movups %xmm3, (%r9) + movups %xmm9, (%r8) + movups %xmm7, (%rdi) + movups %xmm10, (%rsi) + movups %xmm0, (%rcx) + movq -256(%rbp), %rcx + movups %xmm13, (%rbx) + movups %xmm1, (%rcx) + movups %xmm15, (%rax) + addq $240, %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L481: + .cfi_restore_state + movdqa -192(%rbp), %xmm15 + movdqa -208(%rbp), %xmm1 + movdqa %xmm2, %xmm9 + jmp .L478 + .p2align 4,,10 + .p2align 3 +.L482: + movdqa %xmm7, %xmm9 + movdqa %xmm8, %xmm3 + movdqa %xmm10, %xmm7 + movdqa %xmm2, %xmm15 + movdqa %xmm5, %xmm10 + jmp .L478 + .cfi_endproc +.LFE18797: + .size _ZN3hwy7N_SSSE36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0, .-_ZN3hwy7N_SSSE36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + .section .text._ZN3hwy6N_SSE26detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_SSE26detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0, @function +_ZN3hwy6N_SSE26detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0: +.LFB18798: + 
.cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbx + subq $6072, %rsp + .cfi_offset 15, -24 + .cfi_offset 14, -32 + .cfi_offset 13, -40 + .cfi_offset 12, -48 + .cfi_offset 3, -56 + movq %rdi, -5768(%rbp) + movq %rsi, -5960(%rbp) + movq %rdx, -5896(%rbp) + movq %rcx, -5880(%rbp) + movq %r8, -5904(%rbp) + movq %r9, -5944(%rbp) + cmpq $64, %rdx + jbe .L769 + movq %rdi, %r13 + movq %rdi, %r14 + shrq $2, %r13 + movq %r13, %rax + andl $15, %eax + jne .L770 + movq %rdx, %r11 + movq %rdi, %r12 + movq %r8, %r15 +.L496: + movq 8(%r15), %rdx + movq 16(%r15), %r9 + movq %rdx, %rax + leaq 1(%r9), %rdi + movq %rdx, %rsi + xorq (%r15), %rdi + rolq $24, %rax + shrq $11, %rsi + leaq (%rdx,%rdx,8), %rcx + leaq 2(%r9), %rdx + addq %rdi, %rax + xorq %rsi, %rcx + movq %rax, %rsi + xorq %rdx, %rcx + leaq (%rax,%rax,8), %rdx + movq %rax, %r8 + rolq $24, %rsi + shrq $11, %r8 + leaq 3(%r9), %rax + addq %rcx, %rsi + xorq %r8, %rdx + movq %rsi, %r10 + xorq %rax, %rdx + leaq (%rsi,%rsi,8), %rax + movq %rsi, %r8 + rolq $24, %r10 + shrq $11, %r8 + leaq 4(%r9), %rsi + addq $5, %r9 + addq %rdx, %r10 + xorq %r8, %rax + movq %r9, 16(%r15) + movq %r10, %r8 + movq %r10, %rbx + xorq %rsi, %rax + shrq $11, %rbx + rolq $24, %r8 + leaq (%r10,%r10,8), %rsi + addq %rax, %r8 + xorq %rbx, %rsi + xorq %r9, %rsi + leaq (%r8,%r8,8), %r10 + movq %r8, %rbx + rolq $24, %r8 + addq %rsi, %r8 + shrq $11, %rbx + movl %edx, %r9d + movl %esi, %esi + xorq %rbx, %r10 + movq %r8, %xmm5 + movq %r11, %rbx + movabsq $68719476719, %r8 + shrq $4, %rbx + cmpq %r8, %r11 + movl $4294967295, %r8d + movl %edi, %r11d + cmova %r8, %rbx + movq %r10, %xmm0 + shrq $32, %rdi + movl %ecx, %r10d + shrq $32, %rdx + movl %eax, %r8d + punpcklqdq %xmm5, %xmm0 + shrq $32, %rcx + imulq %rbx, %r11 + shrq $32, %rax + movups %xmm0, (%r15) + imulq %rbx, %rdi + imulq %rbx, %r10 + imulq %rbx, %rcx + shrq $32, %r11 + imulq %rbx, %r9 + shrq $32, %rdi + salq $6, %r11 + imulq %rbx, %rdx + shrq $32, %r10 + salq $6, %rdi + addq %r12, %r11 + imulq %rbx, %r8 + shrq $32, %rcx + salq $6, %r10 + addq %r12, %rdi + shrq $32, %r9 + salq $6, %rcx + addq %r12, %r10 + shrq $32, %rdx + salq $6, %r9 + addq %r12, %rcx + shrq $32, %r8 + salq $6, %rdx + addq %r12, %r9 + salq $6, %r8 + addq %r12, %rdx + addq %r12, %r8 + imulq %rbx, %rax + imulq %rbx, %rsi + shrq $32, %rax + shrq $32, %rsi + salq $6, %rax + salq $6, %rsi + addq %r12, %rax + leaq (%r12,%rsi), %rbx + movq -5880(%rbp), %r12 + xorl %esi, %esi +.L498: + movdqa (%r10,%rsi,4), %xmm0 + movdqa (%r11,%rsi,4), %xmm3 + movdqa %xmm0, %xmm1 + movdqa %xmm3, %xmm2 + pcmpgtd %xmm3, %xmm1 + movdqa %xmm1, %xmm4 + pand %xmm1, %xmm2 + pandn %xmm0, %xmm4 + pand %xmm1, %xmm0 + por %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + movdqa %xmm0, %xmm1 + movdqa (%rdi,%rsi,4), %xmm0 + pandn %xmm3, %xmm4 + pcmpgtd %xmm2, %xmm0 + por %xmm4, %xmm1 + movdqa %xmm0, %xmm3 + pand (%rdi,%rsi,4), %xmm0 + pandn %xmm2, %xmm3 + movdqa %xmm1, %xmm2 + por %xmm3, %xmm0 + pcmpgtd %xmm0, %xmm2 + movdqa %xmm2, %xmm3 + pand %xmm2, %xmm0 + pandn %xmm1, %xmm3 + por %xmm3, %xmm0 + movdqa (%rcx,%rsi,4), %xmm3 + movaps %xmm0, (%r12,%rsi,4) + movdqa (%rdx,%rsi,4), %xmm0 + movdqa %xmm3, %xmm2 + movdqa %xmm0, %xmm1 + pcmpgtd %xmm3, %xmm1 + movdqa %xmm1, %xmm4 + pand %xmm1, %xmm2 + pandn %xmm0, %xmm4 + pand %xmm1, %xmm0 + por %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + movdqa %xmm0, %xmm1 + movdqa (%r9,%rsi,4), %xmm0 + pandn %xmm3, %xmm4 + pcmpgtd %xmm2, %xmm0 + por %xmm4, 
%xmm1 + movdqa %xmm0, %xmm3 + pand (%r9,%rsi,4), %xmm0 + pandn %xmm2, %xmm3 + movdqa %xmm1, %xmm2 + por %xmm3, %xmm0 + pcmpgtd %xmm0, %xmm2 + movdqa %xmm2, %xmm3 + pand %xmm2, %xmm0 + pandn %xmm1, %xmm3 + por %xmm3, %xmm0 + movdqa (%r8,%rsi,4), %xmm3 + movaps %xmm0, 64(%r12,%rsi,4) + movdqa (%rbx,%rsi,4), %xmm0 + movdqa %xmm3, %xmm2 + movdqa %xmm0, %xmm1 + pcmpgtd %xmm3, %xmm1 + movdqa %xmm1, %xmm4 + pand %xmm1, %xmm2 + pandn %xmm0, %xmm4 + pand %xmm1, %xmm0 + por %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + movdqa %xmm0, %xmm1 + movdqa (%rax,%rsi,4), %xmm0 + pandn %xmm3, %xmm4 + pcmpgtd %xmm2, %xmm0 + por %xmm4, %xmm1 + movdqa %xmm0, %xmm3 + pand (%rax,%rsi,4), %xmm0 + pandn %xmm2, %xmm3 + movdqa %xmm1, %xmm2 + por %xmm3, %xmm0 + pcmpgtd %xmm0, %xmm2 + movdqa %xmm2, %xmm3 + pand %xmm2, %xmm0 + pandn %xmm1, %xmm3 + por %xmm3, %xmm0 + movaps %xmm0, 128(%r12,%rsi,4) + addq $4, %rsi + cmpq $16, %rsi + jne .L498 + movq -5880(%rbp), %rbx + movd (%rbx), %xmm5 + movdqa 16(%rbx), %xmm1 + movdqa (%rbx), %xmm3 + pshufd $0, %xmm5, %xmm0 + pxor %xmm0, %xmm3 + pxor %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + por %xmm3, %xmm1 + movdqa 32(%rbx), %xmm3 + pxor %xmm0, %xmm3 + por %xmm3, %xmm1 + movdqa 48(%rbx), %xmm3 + pxor %xmm0, %xmm3 + por %xmm3, %xmm1 + movdqa 64(%rbx), %xmm3 + pxor %xmm0, %xmm3 + por %xmm3, %xmm1 + movdqa 80(%rbx), %xmm3 + pxor %xmm0, %xmm3 + por %xmm3, %xmm1 + movdqa 96(%rbx), %xmm3 + pxor %xmm0, %xmm3 + por %xmm3, %xmm1 + movdqa 112(%rbx), %xmm3 + pxor %xmm0, %xmm3 + por %xmm3, %xmm1 + movdqa 128(%rbx), %xmm3 + pxor %xmm0, %xmm3 + por %xmm3, %xmm1 + movdqa 144(%rbx), %xmm3 + pxor %xmm0, %xmm3 + por %xmm3, %xmm1 + movdqa 160(%rbx), %xmm3 + pxor %xmm0, %xmm3 + por %xmm3, %xmm1 + movdqa 176(%rbx), %xmm3 + pxor %xmm0, %xmm3 + por %xmm3, %xmm1 + pxor %xmm3, %xmm3 + pcmpeqd %xmm3, %xmm1 + movmskps %xmm1, %eax + cmpl $15, %eax + je .L499 + movdqa .LC4(%rip), %xmm0 + movl $4, %esi + movq %rbx, %rdi + leaq 192(%rbx), %r12 + movups %xmm0, 192(%rbx) + movups %xmm0, 208(%rbx) + movups %xmm0, 224(%rbx) + movups %xmm0, 240(%rbx) + movups %xmm0, 256(%rbx) + call _ZN3hwy6N_SSE26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + movd (%rbx), %xmm5 + pcmpeqd %xmm2, %xmm2 + pshufd $0, %xmm5, %xmm0 + movd 188(%rbx), %xmm5 + pshufd $0, %xmm5, %xmm1 + paddd %xmm1, %xmm2 + pcmpeqd %xmm0, %xmm2 + movmskps %xmm2, %eax + cmpl $15, %eax + jne .L501 + movq -5896(%rbp), %rsi + movq -5768(%rbp), %rdi + leaq -64(%rbp), %rdx + movq %r12, %rcx + call _ZN3hwy6N_SSE26detail22MaybePartitionTwoValueINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, %al + jne .L483 +.L501: + movq -5880(%rbp), %rbx + movl $23, %eax + movl $24, %edx + movl 96(%rbx), %ecx + movq %rbx, %rdi + cmpl %ecx, 92(%rbx) + je .L543 + jmp .L548 + .p2align 4,,10 + .p2align 3 +.L546: + testq %rax, %rax + je .L771 +.L543: + movq %rax, %rdx + subq $1, %rax + movl (%rdi,%rax,4), %esi + cmpl %ecx, %esi + je .L546 + movq -5880(%rbp), %rbx + movq %rbx, %rdi + cmpl %ecx, (%rbx,%rdx,4) + je .L548 + movl %esi, %ecx + jmp .L545 + .p2align 4,,10 + .p2align 3 +.L549: + cmpq $47, %rdx + je .L767 +.L548: + movq %rdx, %rsi + addq $1, %rdx + cmpl %ecx, (%rdi,%rdx,4) + je .L549 + movl $24, %edx + subq $23, %rsi + subq %rax, %rdx + cmpq %rdx, %rsi + jb .L545 +.L767: + movq -5880(%rbp), %rbx + movl (%rbx,%rax,4), %ecx +.L545: + movd %ecx, %xmm6 + pshufd $0, %xmm6, %xmm3 +.L768: + movl $1, -5948(%rbp) +.L542: + cmpq $0, -5944(%rbp) + 
je .L772 + movq -5896(%rbp), %rax + movq -5768(%rbp), %rsi + movaps %xmm3, -5728(%rbp) + subq $4, %rax + movdqu (%rsi,%rax,4), %xmm5 + movq %rax, %rbx + movq %rax, -5760(%rbp) + andl $15, %ebx + movq %rbx, -5776(%rbp) + movaps %xmm5, -5936(%rbp) + andl $12, %eax + je .L615 + movdqu (%rsi), %xmm1 + pcmpeqd %xmm0, %xmm0 + movdqa %xmm1, %xmm4 + movaps %xmm1, -5696(%rbp) + pcmpgtd %xmm3, %xmm4 + pxor %xmm4, %xmm0 + movaps %xmm4, -5808(%rbp) + movmskps %xmm0, %ebx + movq %rbx, %rdi + salq $4, %rbx + call __popcountdi2@PLT + leaq _ZZN3hwy6N_SSE26detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices(%rip), %rcx + movdqa -5696(%rbp), %xmm1 + movdqa (%rcx,%rbx), %xmm0 + movq %rcx, -5888(%rbp) + movslq %eax, %r15 + movl %eax, -5712(%rbp) + movaps %xmm0, -528(%rbp) + movzbl -520(%rbp), %edx + movd %xmm0, %r9d + movaps %xmm0, -544(%rbp) + andl $15, %r9d + movq %rdx, %rcx + movzbl -535(%rbp), %edx + movaps %xmm0, -512(%rbp) + movzbl -505(%rbp), %eax + movaps %xmm0, -560(%rbp) + andl $15, %ecx + movq %rdx, %rdi + movzbl -550(%rbp), %edx + movaps %xmm0, -496(%rbp) + movzbl -490(%rbp), %r14d + andl $15, %edi + andl $15, %eax + movaps %xmm0, -416(%rbp) + movzbl -415(%rbp), %r10d + andl $15, %edx + movaps %xmm0, -432(%rbp) + andl $15, %r14d + movzbl -430(%rbp), %r11d + movaps %xmm0, -448(%rbp) + movzbl -445(%rbp), %ebx + andl $15, %r10d + movaps %xmm0, -464(%rbp) + movzbl -460(%rbp), %r12d + andl $15, %r11d + movaps %xmm0, -480(%rbp) + movzbl -475(%rbp), %r13d + andl $15, %ebx + movaps %xmm0, -576(%rbp) + andl $15, %r12d + movq %rcx, -5696(%rbp) + andl $15, %r13d + movq %rdi, -5744(%rbp) + movq %rdx, -5784(%rbp) + movzbl -565(%rbp), %edx + movaps %xmm1, -400(%rbp) + movaps %xmm0, -592(%rbp) + movzbl -580(%rbp), %ecx + andl $15, %edx + movd -5712(%rbp), %xmm7 + movaps %xmm0, -608(%rbp) + movzbl -595(%rbp), %esi + movaps %xmm0, -624(%rbp) + movzbl -610(%rbp), %edi + andl $15, %ecx + movaps %xmm0, -640(%rbp) + andl $15, %esi + pshufd $0, %xmm7, %xmm0 + movzbl -400(%rbp,%rax), %eax + movzbl -400(%rbp,%r14), %r14d + movzbl -400(%rbp,%rbx), %ebx + andl $15, %edi + salq $8, %rax + movzbl -625(%rbp), %r8d + movzbl -400(%rbp,%r13), %r13d + orq %r14, %rax + movzbl -400(%rbp,%r12), %r12d + movzbl -400(%rbp,%r11), %r11d + salq $8, %rax + andl $15, %r8d + movzbl -400(%rbp,%r10), %r10d + movzbl -400(%rbp,%rdi), %edi + orq %r13, %rax + movzbl -400(%rbp,%rsi), %esi + movzbl -400(%rbp,%r8), %r8d + salq $8, %rax + movzbl -400(%rbp,%rcx), %ecx + movzbl -400(%rbp,%rdx), %edx + orq %r12, %rax + salq $8, %r8 + movdqa .LC0(%rip), %xmm7 + movzbl -400(%rbp,%r9), %r9d + salq $8, %rax + orq %rbx, %rax + pcmpgtd %xmm7, %xmm0 + movaps %xmm7, -5920(%rbp) + salq $8, %rax + orq %r11, %rax + salq $8, %rax + orq %r10, %rax + salq $8, %rax + orq %rdi, %r8 + movq -5744(%rbp), %rdi + salq $8, %r8 + movq %rax, %rbx + movd %xmm0, %eax + orq %rsi, %r8 + orq %r9, %rbx + salq $8, %r8 + orq %rcx, %r8 + movq -5696(%rbp), %rcx + movq %rbx, -5696(%rbp) + salq $8, %r8 + orq %rdx, %r8 + movq -5784(%rbp), %rdx + salq $8, %r8 + movzbl -400(%rbp,%rdx), %edx + orq %rdx, %r8 + movzbl -400(%rbp,%rdi), %edx + salq $8, %r8 + orq %rdx, %r8 + movzbl -400(%rbp,%rcx), %edx + salq $8, %r8 + orq %rdx, %r8 + testl %eax, %eax + movq %r8, -5688(%rbp) + movdqa -5808(%rbp), %xmm4 + je .L553 + movq -5768(%rbp), %rsi + movdqa -5696(%rbp), %xmm6 + movd %xmm6, (%rsi) +.L553: + pshufd $85, %xmm0, %xmm2 + movd %xmm2, %eax + testl %eax, %eax + je .L554 + pshufd $85, -5696(%rbp), %xmm2 + movq -5768(%rbp), %rax + movd %xmm2, 4(%rax) +.L554: + 
movdqa %xmm0, %xmm2 + punpckhdq %xmm0, %xmm2 + movd %xmm2, %eax + testl %eax, %eax + je .L555 + movdqa -5696(%rbp), %xmm2 + movq -5768(%rbp), %rax + punpckhdq %xmm2, %xmm2 + movd %xmm2, 8(%rax) +.L555: + pshufd $255, %xmm0, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L556 + pshufd $255, -5696(%rbp), %xmm0 + movq -5768(%rbp), %rax + movd %xmm0, 12(%rax) +.L556: + movq -5768(%rbp), %rax + movaps %xmm1, -5744(%rbp) + leaq (%rax,%r15,4), %rbx + movq %rbx, -5696(%rbp) + movmskps %xmm4, %ebx + movq %rbx, %rdi + salq $4, %rbx + call __popcountdi2@PLT + movdqa -5744(%rbp), %xmm1 + movslq %eax, %rsi + movq %rsi, -5712(%rbp) + movq -5888(%rbp), %rsi + movaps %xmm1, -384(%rbp) + movdqa (%rsi,%rbx), %xmm0 + movaps %xmm0, -768(%rbp) + movzbl -760(%rbp), %edx + movd %xmm0, %r8d + movaps %xmm0, -784(%rbp) + andl $15, %r8d + movq %rdx, %rcx + movzbl -775(%rbp), %edx + movaps %xmm0, -800(%rbp) + movaps %xmm0, -752(%rbp) + movzbl -745(%rbp), %eax + andl $15, %ecx + movq %rdx, %rdi + movzbl -790(%rbp), %edx + movaps %xmm0, -736(%rbp) + movzbl -730(%rbp), %r13d + movaps %xmm0, -816(%rbp) + andl $15, %edi + andl $15, %eax + movq %rdx, %r15 + movzbl -805(%rbp), %edx + movq %rcx, -5744(%rbp) + andl $15, %r13d + movq %rdi, -5784(%rbp) + andl $15, %r15d + movaps %xmm0, -656(%rbp) + movq %rdx, %r14 + movzbl -655(%rbp), %r9d + movaps %xmm0, -672(%rbp) + movzbl -670(%rbp), %r10d + andl $15, %r14d + movaps %xmm0, -688(%rbp) + movzbl -685(%rbp), %r11d + andl $15, %r9d + movaps %xmm0, -704(%rbp) + movzbl -700(%rbp), %ebx + andl $15, %r10d + movaps %xmm0, -720(%rbp) + movzbl -715(%rbp), %r12d + andl $15, %r11d + movaps %xmm0, -832(%rbp) + movzbl -820(%rbp), %edx + andl $15, %ebx + movaps %xmm0, -848(%rbp) + andl $15, %r12d + movzbl -835(%rbp), %ecx + movaps %xmm0, -864(%rbp) + movzbl -850(%rbp), %esi + andl $15, %edx + movaps %xmm0, -880(%rbp) + movzbl -384(%rbp,%rax), %eax + andl $15, %ecx + movzbl -384(%rbp,%r13), %r13d + movzbl -384(%rbp,%r12), %r12d + movzbl -384(%rbp,%rbx), %ebx + andl $15, %esi + salq $8, %rax + movzbl -865(%rbp), %edi + movzbl -384(%rbp,%r11), %r11d + orq %r13, %rax + movzbl -384(%rbp,%rsi), %esi + movzbl -384(%rbp,%r10), %r10d + salq $8, %rax + andl $15, %edi + movzbl -384(%rbp,%r9), %r9d + movzbl -384(%rbp,%rcx), %ecx + orq %r12, %rax + movzbl -384(%rbp,%rdi), %edi + movzbl -384(%rbp,%rdx), %edx + movzbl -384(%rbp,%r8), %r8d + salq $8, %rax + orq %rbx, %rax + salq $8, %rdi + movq -5784(%rbp), %rbx + salq $8, %rax + orq %r11, %rax + salq $8, %rax + orq %r10, %rax + salq $8, %rax + orq %r9, %rax + salq $8, %rax + orq %rsi, %rdi + salq $8, %rdi + orq %r8, %rax + orq %rcx, %rdi + movq -5744(%rbp), %rcx + movq %rax, -5744(%rbp) + salq $8, %rdi + orq %rdx, %rdi + movzbl -384(%rbp,%r14), %edx + salq $8, %rdi + orq %rdx, %rdi + movzbl -384(%rbp,%r15), %edx + salq $8, %rdi + orq %rdx, %rdi + movzbl -384(%rbp,%rbx), %edx + salq $8, %rdi + orq %rdx, %rdi + movzbl -384(%rbp,%rcx), %edx + movq -5880(%rbp), %rcx + salq $8, %rdi + orq %rdx, %rdi + movq %rdi, -5736(%rbp) + movdqa -5744(%rbp), %xmm6 + movups %xmm6, (%rcx) + testb $8, -5760(%rbp) + je .L557 + movq -5768(%rbp), %rax + pcmpeqd %xmm0, %xmm0 + movdqu 16(%rax), %xmm1 + movdqa %xmm1, %xmm3 + movaps %xmm1, -5744(%rbp) + pcmpgtd -5728(%rbp), %xmm3 + pxor %xmm3, %xmm0 + movaps %xmm3, -5824(%rbp) + movmskps %xmm0, %ebx + movq %rbx, %rdi + salq $4, %rbx + call __popcountdi2@PLT + movq -5888(%rbp), %rsi + movdqa -5744(%rbp), %xmm1 + movl %eax, -5784(%rbp) + movslq %eax, %r14 + movdqa (%rsi,%rbx), %xmm0 + movaps %xmm1, -368(%rbp) + movaps %xmm0, 
-896(%rbp) + movzbl -895(%rbp), %eax + movd %xmm0, %r13d + movaps %xmm0, -1008(%rbp) + movzbl -1000(%rbp), %edx + andl $15, %r13d + movaps %xmm0, -912(%rbp) + movq %rax, %rbx + movzbl -910(%rbp), %eax + movaps %xmm0, -1024(%rbp) + movq %rdx, %rsi + andl $15, %ebx + movzbl -1015(%rbp), %edx + movaps %xmm0, -992(%rbp) + movq %rax, %r15 + movzbl -985(%rbp), %eax + andl $15, %esi + movaps %xmm0, -976(%rbp) + movq %rdx, %rcx + andl $15, %r15d + movzbl -970(%rbp), %r12d + andl $15, %ecx + andl $15, %eax + movaps %xmm0, -928(%rbp) + movzbl -925(%rbp), %r10d + movaps %xmm0, -944(%rbp) + andl $15, %r12d + movzbl -940(%rbp), %r11d + movaps %xmm0, -960(%rbp) + andl $15, %r10d + movaps %xmm0, -1040(%rbp) + movzbl -1030(%rbp), %edx + andl $15, %r11d + movq %rbx, -5744(%rbp) + movzbl -955(%rbp), %ebx + movq %rsi, -5808(%rbp) + andl $15, %edx + movq %rcx, -5792(%rbp) + andl $15, %ebx + movaps %xmm0, -1056(%rbp) + movzbl -1045(%rbp), %ecx + movd -5784(%rbp), %xmm5 + movaps %xmm0, -1072(%rbp) + movzbl -1060(%rbp), %esi + movaps %xmm0, -1088(%rbp) + movzbl -1075(%rbp), %edi + andl $15, %ecx + movaps %xmm0, -1104(%rbp) + movzbl -1090(%rbp), %r8d + andl $15, %esi + movaps %xmm0, -1120(%rbp) + andl $15, %edi + pshufd $0, %xmm5, %xmm0 + movzbl -368(%rbp,%rax), %eax + movzbl -368(%rbp,%r12), %r12d + andl $15, %r8d + movzbl -368(%rbp,%rbx), %ebx + salq $8, %rax + movzbl -1105(%rbp), %r9d + movzbl -368(%rbp,%r11), %r11d + orq %r12, %rax + movzbl -368(%rbp,%rdi), %edi + movzbl -368(%rbp,%r10), %r10d + salq $8, %rax + andl $15, %r9d + movzbl -368(%rbp,%r8), %r8d + movzbl -368(%rbp,%rsi), %esi + orq %rbx, %rax + movq -5744(%rbp), %rbx + movzbl -368(%rbp,%r9), %r9d + salq $8, %rax + movzbl -368(%rbp,%rcx), %ecx + movzbl -368(%rbp,%rdx), %edx + orq %r11, %rax + salq $8, %r9 + salq $8, %rax + orq %r10, %rax + movzbl -368(%rbp,%r15), %r10d + salq $8, %rax + orq %r10, %rax + movzbl -368(%rbp,%rbx), %r10d + salq $8, %rax + orq %r10, %rax + movzbl -368(%rbp,%r13), %r10d + salq $8, %rax + orq %r8, %r9 + salq $8, %r9 + movq %rax, %rbx + orq %rdi, %r9 + orq %r10, %rbx + salq $8, %r9 + movq %rbx, -5744(%rbp) + orq %rsi, %r9 + movq -5808(%rbp), %rsi + salq $8, %r9 + orq %rcx, %r9 + movq -5792(%rbp), %rcx + salq $8, %r9 + orq %rdx, %r9 + movzbl -368(%rbp,%rcx), %edx + salq $8, %r9 + orq %rdx, %r9 + movzbl -368(%rbp,%rsi), %edx + salq $8, %r9 + orq %rdx, %r9 + movq %r9, -5736(%rbp) + pcmpgtd -5920(%rbp), %xmm0 + movdqa -5824(%rbp), %xmm3 + movd %xmm0, %eax + testl %eax, %eax + je .L558 + movq -5696(%rbp), %rbx + movdqa -5744(%rbp), %xmm7 + movd %xmm7, (%rbx) +.L558: + pshufd $85, %xmm0, %xmm2 + movd %xmm2, %eax + testl %eax, %eax + je .L559 + pshufd $85, -5744(%rbp), %xmm2 + movq -5696(%rbp), %rax + movd %xmm2, 4(%rax) +.L559: + movdqa %xmm0, %xmm2 + punpckhdq %xmm0, %xmm2 + movd %xmm2, %eax + testl %eax, %eax + je .L560 + movdqa -5744(%rbp), %xmm2 + movq -5696(%rbp), %rax + punpckhdq %xmm2, %xmm2 + movd %xmm2, 8(%rax) +.L560: + pshufd $255, %xmm0, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L561 + pshufd $255, -5744(%rbp), %xmm0 + movq -5696(%rbp), %rax + movd %xmm0, 12(%rax) +.L561: + movq -5696(%rbp), %rax + movmskps %xmm3, %ebx + movaps %xmm1, -5744(%rbp) + movq %rbx, %rdi + salq $4, %rbx + leaq (%rax,%r14,4), %rax + movq %rax, -5696(%rbp) + call __popcountdi2@PLT + movdqa -5744(%rbp), %xmm1 + movslq %eax, %r13 + movq -5888(%rbp), %rax + movaps %xmm1, -352(%rbp) + movdqa (%rax,%rbx), %xmm0 + movaps %xmm0, -1136(%rbp) + movzbl -1135(%rbp), %eax + movd %xmm0, %r14d + movaps %xmm0, -1248(%rbp) + movzbl -1240(%rbp), %edx + 
andl $15, %r14d + andl $15, %eax + movaps %xmm0, -1152(%rbp) + movq %rax, -5744(%rbp) + movzbl -1150(%rbp), %eax + movq %rdx, %rsi + movaps %xmm0, -1264(%rbp) + movzbl -1255(%rbp), %edx + andl $15, %esi + movq %rax, %r15 + movaps %xmm0, -1232(%rbp) + movzbl -1225(%rbp), %eax + movaps %xmm0, -1216(%rbp) + movq %rdx, %rdi + andl $15, %r15d + movzbl -1210(%rbp), %r12d + andl $15, %edi + andl $15, %eax + movaps %xmm0, -1168(%rbp) + movzbl -1165(%rbp), %r10d + movq %rdi, -5808(%rbp) + andl $15, %r12d + movaps %xmm0, -1184(%rbp) + movzbl -1180(%rbp), %r11d + andl $15, %r10d + movaps %xmm0, -1200(%rbp) + movzbl -1195(%rbp), %ebx + movaps %xmm0, -1280(%rbp) + movzbl -1270(%rbp), %edx + andl $15, %r11d + movaps %xmm0, -1296(%rbp) + movzbl -1285(%rbp), %ecx + andl $15, %ebx + movq %rsi, -5784(%rbp) + andl $15, %edx + movaps %xmm0, -1312(%rbp) + movzbl -1300(%rbp), %esi + andl $15, %ecx + movaps %xmm0, -1328(%rbp) + movzbl -1315(%rbp), %edi + movaps %xmm0, -1344(%rbp) + movzbl -1330(%rbp), %r8d + andl $15, %esi + movaps %xmm0, -1360(%rbp) + movzbl -352(%rbp,%rax), %eax + andl $15, %edi + movzbl -352(%rbp,%r12), %r12d + movzbl -352(%rbp,%rbx), %ebx + movzbl -1345(%rbp), %r9d + andl $15, %r8d + salq $8, %rax + movzbl -352(%rbp,%r11), %r11d + movzbl -352(%rbp,%r10), %r10d + orq %r12, %rax + andl $15, %r9d + movzbl -352(%rbp,%r8), %r8d + movzbl -352(%rbp,%rdi), %edi + salq $8, %rax + movzbl -352(%rbp,%rsi), %esi + movzbl -352(%rbp,%r9), %r9d + orq %rbx, %rax + movq -5744(%rbp), %rbx + movzbl -352(%rbp,%rcx), %ecx + salq $8, %rax + salq $8, %r9 + movzbl -352(%rbp,%rdx), %edx + orq %r11, %rax + salq $8, %rax + orq %r10, %rax + movzbl -352(%rbp,%r15), %r10d + salq $8, %rax + orq %r10, %rax + movzbl -352(%rbp,%rbx), %r10d + movq -5880(%rbp), %rbx + salq $8, %rax + orq %r10, %rax + movzbl -352(%rbp,%r14), %r10d + salq $8, %rax + orq %r8, %r9 + salq $8, %r9 + orq %r10, %rax + orq %rdi, %r9 + movq -5808(%rbp), %rdi + movq %rax, -5744(%rbp) + salq $8, %r9 + movq -5712(%rbp), %rax + orq %rsi, %r9 + movq -5784(%rbp), %rsi + salq $8, %r9 + orq %rcx, %r9 + salq $8, %r9 + orq %rdx, %r9 + movzbl -352(%rbp,%rdi), %edx + salq $8, %r9 + orq %rdx, %r9 + movzbl -352(%rbp,%rsi), %edx + salq $8, %r9 + orq %rdx, %r9 + movq %r9, -5736(%rbp) + movdqa -5744(%rbp), %xmm5 + movups %xmm5, (%rbx,%rax,4) + addq %r13, %rax + cmpq $11, -5776(%rbp) + movq %rax, -5712(%rbp) + jbe .L557 + movq -5768(%rbp), %rbx + pcmpeqd %xmm0, %xmm0 + movdqu 32(%rbx), %xmm1 + movdqa %xmm1, %xmm3 + movaps %xmm1, -5744(%rbp) + pcmpgtd -5728(%rbp), %xmm3 + pxor %xmm3, %xmm0 + movaps %xmm3, -5824(%rbp) + movmskps %xmm0, %ebx + movq %rbx, %rdi + salq $4, %rbx + call __popcountdi2@PLT + movdqa -5744(%rbp), %xmm1 + movslq %eax, %r12 + movq -5888(%rbp), %rax + movl %r12d, -5784(%rbp) + movdqa (%rax,%rbx), %xmm0 + movaps %xmm1, -336(%rbp) + movaps %xmm0, -1376(%rbp) + movzbl -1375(%rbp), %eax + movd %xmm0, %r9d + movaps %xmm0, -1392(%rbp) + andl $15, %r9d + movaps %xmm0, -1488(%rbp) + movq %rax, %r15 + movzbl -1480(%rbp), %edx + movzbl -1390(%rbp), %eax + movaps %xmm0, -1408(%rbp) + andl $15, %r15d + movaps %xmm0, -1504(%rbp) + movq %rax, %r14 + movq %rdx, %rcx + movzbl -1405(%rbp), %eax + movzbl -1495(%rbp), %edx + andl $15, %ecx + andl $15, %r14d + movaps %xmm0, -1472(%rbp) + movaps %xmm0, -1520(%rbp) + movq %rax, %r13 + movzbl -1465(%rbp), %eax + movq %rdx, %rdi + movzbl -1510(%rbp), %edx + movaps %xmm0, -1456(%rbp) + movzbl -1450(%rbp), %ebx + andl $15, %edi + andl $15, %eax + movaps %xmm0, -1424(%rbp) + movzbl -1420(%rbp), %r10d + andl $15, %edx + andl $15, 
%ebx + andl $15, %r13d + movaps %xmm0, -1440(%rbp) + movq %rcx, -5744(%rbp) + movzbl -1435(%rbp), %r11d + andl $15, %r10d + movq %rdi, -5808(%rbp) + movq %rdx, -5792(%rbp) + andl $15, %r11d + movaps %xmm0, -1536(%rbp) + movzbl -1525(%rbp), %edx + movd -5784(%rbp), %xmm5 + movaps %xmm0, -1552(%rbp) + movzbl -1540(%rbp), %ecx + movaps %xmm0, -1568(%rbp) + movzbl -1555(%rbp), %esi + andl $15, %edx + movaps %xmm0, -1584(%rbp) + movzbl -1570(%rbp), %edi + andl $15, %ecx + movaps %xmm0, -1600(%rbp) + andl $15, %esi + pshufd $0, %xmm5, %xmm0 + movzbl -336(%rbp,%rax), %eax + movzbl -336(%rbp,%rbx), %ebx + movzbl -1585(%rbp), %r8d + andl $15, %edi + salq $8, %rax + movzbl -336(%rbp,%rdi), %edi + movzbl -336(%rbp,%r11), %r11d + orq %rbx, %rax + andl $15, %r8d + movzbl -336(%rbp,%r10), %r10d + movzbl -336(%rbp,%rsi), %esi + salq $8, %rax + movzbl -336(%rbp,%rcx), %ecx + movzbl -336(%rbp,%r8), %r8d + orq %r11, %rax + movzbl -336(%rbp,%rdx), %edx + movzbl -336(%rbp,%r9), %r9d + salq $8, %rax + salq $8, %r8 + orq %r10, %rax + movzbl -336(%rbp,%r13), %r10d + salq $8, %rax + orq %r10, %rax + movzbl -336(%rbp,%r14), %r10d + salq $8, %rax + orq %r10, %rax + movzbl -336(%rbp,%r15), %r10d + salq $8, %rax + orq %r10, %rax + salq $8, %rax + orq %rdi, %r8 + movq -5808(%rbp), %rdi + salq $8, %r8 + orq %r9, %rax + orq %rsi, %r8 + salq $8, %r8 + orq %rcx, %r8 + movq -5744(%rbp), %rcx + movq %rax, -5744(%rbp) + salq $8, %r8 + orq %rdx, %r8 + movq -5792(%rbp), %rdx + salq $8, %r8 + movzbl -336(%rbp,%rdx), %edx + orq %rdx, %r8 + movzbl -336(%rbp,%rdi), %edx + salq $8, %r8 + orq %rdx, %r8 + movzbl -336(%rbp,%rcx), %edx + salq $8, %r8 + orq %rdx, %r8 + movq %r8, -5736(%rbp) + pcmpgtd -5920(%rbp), %xmm0 + movdqa -5824(%rbp), %xmm3 + movd %xmm0, %eax + testl %eax, %eax + je .L562 + movq -5696(%rbp), %rax + movdqa -5744(%rbp), %xmm5 + movd %xmm5, (%rax) +.L562: + pshufd $85, %xmm0, %xmm2 + movd %xmm2, %eax + testl %eax, %eax + je .L563 + pshufd $85, -5744(%rbp), %xmm2 + movq -5696(%rbp), %rax + movd %xmm2, 4(%rax) +.L563: + movdqa %xmm0, %xmm2 + punpckhdq %xmm0, %xmm2 + movd %xmm2, %eax + testl %eax, %eax + je .L564 + movdqa -5744(%rbp), %xmm2 + movq -5696(%rbp), %rax + punpckhdq %xmm2, %xmm2 + movd %xmm2, 8(%rax) +.L564: + pshufd $255, %xmm0, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L565 + pshufd $255, -5744(%rbp), %xmm0 + movq -5696(%rbp), %rax + movd %xmm0, 12(%rax) +.L565: + movq -5696(%rbp), %rax + movmskps %xmm3, %ebx + movaps %xmm1, -5744(%rbp) + movq %rbx, %rdi + salq $4, %rbx + leaq (%rax,%r12,4), %rax + movq %rax, -5696(%rbp) + call __popcountdi2@PLT + movdqa -5744(%rbp), %xmm1 + movslq %eax, %r8 + movq -5888(%rbp), %rax + movaps %xmm1, -320(%rbp) + movdqa (%rax,%rbx), %xmm0 + movaps %xmm0, -1616(%rbp) + movzbl -1615(%rbp), %eax + movd %xmm0, %r12d + movaps %xmm0, -1728(%rbp) + movzbl -1720(%rbp), %edx + andl $15, %r12d + movq %rax, %r15 + movaps %xmm0, -1632(%rbp) + movzbl -1630(%rbp), %eax + movq %rdx, %rsi + movaps %xmm0, -1744(%rbp) + movzbl -1735(%rbp), %edx + andl $15, %r15d + movq %rax, %r14 + movaps %xmm0, -1648(%rbp) + movzbl -1645(%rbp), %eax + andl $15, %esi + movq %rdx, %rcx + movaps %xmm0, -1760(%rbp) + movzbl -1750(%rbp), %edx + andl $15, %r14d + movq %rax, %r13 + movaps %xmm0, -1712(%rbp) + andl $15, %ecx + movzbl -1705(%rbp), %eax + movaps %xmm0, -1696(%rbp) + movq %rdx, %rdi + andl $15, %r13d + movzbl -1690(%rbp), %ebx + andl $15, %edi + andl $15, %eax + movq %rsi, -5744(%rbp) + movq %rdi, -5808(%rbp) + andl $15, %ebx + movaps %xmm0, -1664(%rbp) + movzbl -1660(%rbp), %r10d + movaps 
%xmm0, -1680(%rbp) + movzbl -1675(%rbp), %r11d + movaps %xmm0, -1776(%rbp) + movzbl -1765(%rbp), %edx + andl $15, %r10d + movq %rcx, -5784(%rbp) + andl $15, %r11d + movaps %xmm0, -1792(%rbp) + movzbl -1780(%rbp), %ecx + andl $15, %edx + movaps %xmm0, -1808(%rbp) + movzbl -1795(%rbp), %esi + movaps %xmm0, -1824(%rbp) + movzbl -1810(%rbp), %edi + andl $15, %ecx + movaps %xmm0, -1840(%rbp) + movzbl -320(%rbp,%rax), %eax + andl $15, %esi + movzbl -320(%rbp,%rbx), %ebx + movzbl -320(%rbp,%r11), %r11d + movzbl -1825(%rbp), %r9d + andl $15, %edi + salq $8, %rax + movzbl -320(%rbp,%rdi), %edi + movzbl -320(%rbp,%r10), %r10d + orq %rbx, %rax + andl $15, %r9d + movzbl -320(%rbp,%rsi), %esi + movzbl -320(%rbp,%rcx), %ecx + salq $8, %rax + movzbl -320(%rbp,%rdx), %edx + movzbl -320(%rbp,%r9), %r9d + orq %r11, %rax + movq -5880(%rbp), %rbx + salq $8, %rax + salq $8, %r9 + orq %r10, %rax + movzbl -320(%rbp,%r13), %r10d + salq $8, %rax + orq %r10, %rax + movzbl -320(%rbp,%r14), %r10d + salq $8, %rax + orq %r10, %rax + movzbl -320(%rbp,%r15), %r10d + salq $8, %rax + orq %r10, %rax + movzbl -320(%rbp,%r12), %r10d + salq $8, %rax + orq %rdi, %r9 + movq -5808(%rbp), %rdi + salq $8, %r9 + orq %r10, %rax + orq %rsi, %r9 + movq -5744(%rbp), %rsi + movq %rax, -5744(%rbp) + salq $8, %r9 + movq -5712(%rbp), %rax + orq %rcx, %r9 + movq -5784(%rbp), %rcx + salq $8, %r9 + orq %rdx, %r9 + movzbl -320(%rbp,%rdi), %edx + salq $8, %r9 + orq %rdx, %r9 + movzbl -320(%rbp,%rcx), %edx + salq $8, %r9 + orq %rdx, %r9 + movzbl -320(%rbp,%rsi), %edx + salq $8, %r9 + orq %rdx, %r9 + movq %r9, -5736(%rbp) + movdqa -5744(%rbp), %xmm5 + movups %xmm5, (%rbx,%rax,4) + addq %r8, %rax + movq %rax, -5712(%rbp) +.L557: + movq -5776(%rbp), %rbx + leaq -4(%rbx), %rax + leaq 1(%rbx), %rdx + movq -5712(%rbp), %rbx + andq $-4, %rax + addq $4, %rax + cmpq $4, %rdx + movl $4, %edx + cmovbe %rdx, %rax + salq $2, %rbx +.L552: + movq -5776(%rbp), %rcx + cmpq %rax, %rcx + je .L566 + movq -5768(%rbp), %rsi + subq %rax, %rcx + movq %rcx, %xmm1 + movdqu (%rsi,%rax,4), %xmm3 + pshufd $0, %xmm1, %xmm1 + pcmpgtd -5920(%rbp), %xmm1 + movdqa %xmm3, %xmm2 + movaps %xmm3, -5744(%rbp) + pcmpgtd -5728(%rbp), %xmm2 + movaps %xmm1, -5824(%rbp) + movdqa %xmm2, %xmm0 + movaps %xmm2, -5840(%rbp) + pandn %xmm1, %xmm0 + movmskps %xmm0, %r12d + movq %r12, %rdi + salq $4, %r12 + call __popcountdi2@PLT + movq -5888(%rbp), %rsi + movdqa -5744(%rbp), %xmm3 + movl %eax, -5776(%rbp) + movslq %eax, %r14 + movdqa (%rsi,%r12), %xmm0 + movaps %xmm3, -304(%rbp) + movaps %xmm0, -1968(%rbp) + movzbl -1960(%rbp), %edx + movd %xmm0, %r8d + movaps %xmm0, -1856(%rbp) + movzbl -1855(%rbp), %eax + andl $15, %r8d + movaps %xmm0, -1984(%rbp) + movq %rdx, %rcx + movzbl -1975(%rbp), %edx + movaps %xmm0, -1952(%rbp) + movq %rax, %rsi + movzbl -1945(%rbp), %eax + andl $15, %ecx + movaps %xmm0, -1936(%rbp) + movq %rdx, %rdi + andl $15, %esi + movzbl -1930(%rbp), %r13d + andl $15, %edi + andl $15, %eax + movaps %xmm0, -1872(%rbp) + movzbl -1870(%rbp), %r9d + movaps %xmm0, -1888(%rbp) + andl $15, %r13d + movzbl -1885(%rbp), %r10d + movaps %xmm0, -1904(%rbp) + movzbl -1900(%rbp), %r11d + andl $15, %r9d + movaps %xmm0, -1920(%rbp) + movzbl -1915(%rbp), %r12d + andl $15, %r10d + movq %rsi, -5744(%rbp) + andl $15, %r11d + movq %rcx, -5784(%rbp) + andl $15, %r12d + movq %rdi, -5808(%rbp) + movaps %xmm0, -2000(%rbp) + movzbl -1990(%rbp), %edx + movaps %xmm0, -2016(%rbp) + movaps %xmm0, -2032(%rbp) + andl $15, %edx + movaps %xmm0, -2048(%rbp) + movzbl -2035(%rbp), %ecx + movaps %xmm0, -2064(%rbp) + 
movzbl -2050(%rbp), %esi + movaps %xmm0, -2080(%rbp) + movzbl -304(%rbp,%rax), %eax + movzbl -2065(%rbp), %edi + andl $15, %ecx + movzbl -304(%rbp,%r13), %r13d + movq %rdx, -5792(%rbp) + andl $15, %esi + salq $8, %rax + movzbl -2005(%rbp), %edx + andl $15, %edi + movzbl -304(%rbp,%r12), %r12d + orq %r13, %rax + movzbl -304(%rbp,%r11), %r11d + movzbl -304(%rbp,%r10), %r10d + salq $8, %rax + movzbl -304(%rbp,%rdi), %edi + movq %rdx, %r15 + movzbl -304(%rbp,%r9), %r9d + orq %r12, %rax + movzbl -2020(%rbp), %edx + andl $15, %r15d + movzbl -304(%rbp,%rsi), %esi + salq $8, %rax + movzbl -304(%rbp,%rcx), %ecx + movzbl -304(%rbp,%r8), %r8d + orq %r11, %rax + movq -5744(%rbp), %r11 + andl $15, %edx + salq $8, %rax + movzbl -304(%rbp,%rdx), %edx + orq %r10, %rax + salq $8, %rax + orq %r9, %rax + movzbl -304(%rbp,%r11), %r9d + salq $8, %rax + orq %r9, %rax + salq $8, %rax + salq $8, %rdi + orq %rsi, %rdi + movq -5808(%rbp), %rsi + salq $8, %rdi + orq %rcx, %rdi + movq -5784(%rbp), %rcx + salq $8, %rdi + orq %rdx, %rdi + movzbl -304(%rbp,%r15), %edx + salq $8, %rdi + orq %rdx, %rdi + movq -5792(%rbp), %rdx + salq $8, %rdi + movzbl -304(%rbp,%rdx), %edx + orq %rdx, %rdi + movzbl -304(%rbp,%rsi), %edx + movq %rax, %rsi + salq $8, %rdi + orq %r8, %rsi + orq %rdx, %rdi + movzbl -304(%rbp,%rcx), %edx + movd -5776(%rbp), %xmm7 + movq %rsi, -5744(%rbp) + salq $8, %rdi + movdqa -5824(%rbp), %xmm1 + movdqa -5840(%rbp), %xmm2 + pshufd $0, %xmm7, %xmm0 + orq %rdx, %rdi + pcmpgtd -5920(%rbp), %xmm0 + movq %rdi, -5736(%rbp) + movd %xmm0, %eax + testl %eax, %eax + je .L567 + movq -5696(%rbp), %rax + movdqa -5744(%rbp), %xmm7 + movd %xmm7, (%rax) +.L567: + pshufd $85, %xmm0, %xmm4 + movd %xmm4, %eax + testl %eax, %eax + je .L568 + pshufd $85, -5744(%rbp), %xmm4 + movq -5696(%rbp), %rax + movd %xmm4, 4(%rax) +.L568: + movdqa %xmm0, %xmm4 + punpckhdq %xmm0, %xmm4 + movd %xmm4, %eax + testl %eax, %eax + je .L569 + movdqa -5744(%rbp), %xmm4 + movq -5696(%rbp), %rax + punpckhdq %xmm4, %xmm4 + movd %xmm4, 8(%rax) +.L569: + pshufd $255, %xmm0, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L570 + pshufd $255, -5744(%rbp), %xmm0 + movq -5696(%rbp), %rax + movd %xmm0, 12(%rax) +.L570: + movq -5696(%rbp), %rax + pand %xmm1, %xmm2 + movaps %xmm3, -5744(%rbp) + movmskps %xmm2, %r12d + leaq (%rax,%r14,4), %rax + movq %r12, %rdi + salq $4, %r12 + movq %rax, -5696(%rbp) + call __popcountdi2@PLT + movdqa -5744(%rbp), %xmm3 + movslq %eax, %r8 + movq -5888(%rbp), %rax + movaps %xmm3, -288(%rbp) + movdqa (%rax,%r12), %xmm0 + movaps %xmm0, -2208(%rbp) + movzbl -2200(%rbp), %edx + movd %xmm0, %r9d + movaps %xmm0, -2224(%rbp) + andl $15, %r9d + movq %rdx, %rsi + movzbl -2215(%rbp), %edx + movaps %xmm0, -2096(%rbp) + movzbl -2095(%rbp), %eax + movaps %xmm0, -2240(%rbp) + andl $15, %esi + movq %rdx, %rcx + movzbl -2230(%rbp), %edx + movq %rax, %r15 + movaps %xmm0, -2112(%rbp) + movzbl -2110(%rbp), %eax + andl $15, %ecx + andl $15, %r15d + movaps %xmm0, -2256(%rbp) + movq %rdx, %rdi + movzbl -2245(%rbp), %edx + movaps %xmm0, -2192(%rbp) + andl $15, %edi + movq %rax, %r14 + movzbl -2185(%rbp), %eax + movaps %xmm0, -2176(%rbp) + movq %rdi, -5784(%rbp) + movq %rdx, %rdi + andl $15, %r14d + movzbl -2170(%rbp), %r13d + andl $15, %edi + andl $15, %eax + movq %rsi, -5744(%rbp) + movq %rcx, -5776(%rbp) + andl $15, %r13d + movq %rdi, -5808(%rbp) + movaps %xmm0, -2128(%rbp) + movzbl -2125(%rbp), %r10d + movaps %xmm0, -2144(%rbp) + movzbl -2140(%rbp), %r11d + movaps %xmm0, -2160(%rbp) + movzbl -2155(%rbp), %r12d + andl $15, %r10d + movaps %xmm0, 
-2272(%rbp) + andl $15, %r11d + movzbl -2260(%rbp), %edx + movaps %xmm0, -2288(%rbp) + andl $15, %r12d + movzbl -2275(%rbp), %ecx + movaps %xmm0, -2304(%rbp) + movzbl -2290(%rbp), %esi + andl $15, %edx + movaps %xmm0, -2320(%rbp) + movzbl -288(%rbp,%rax), %eax + movzbl -288(%rbp,%r13), %r13d + andl $15, %ecx + movzbl -288(%rbp,%r12), %r12d + movzbl -2305(%rbp), %edi + andl $15, %esi + salq $8, %rax + movzbl -288(%rbp,%rsi), %esi + movzbl -288(%rbp,%r11), %r11d + orq %r13, %rax + andl $15, %edi + movzbl -288(%rbp,%r10), %r10d + movzbl -288(%rbp,%rcx), %ecx + salq $8, %rax + movzbl -288(%rbp,%rdi), %edi + movzbl -288(%rbp,%rdx), %edx + movzbl -288(%rbp,%r9), %r9d + orq %r12, %rax + salq $8, %rax + salq $8, %rdi + orq %r11, %rax + salq $8, %rax + orq %r10, %rax + movzbl -288(%rbp,%r14), %r10d + salq $8, %rax + orq %r10, %rax + movzbl -288(%rbp,%r15), %r10d + movq -5808(%rbp), %r15 + salq $8, %rax + orq %r10, %rax + salq $8, %rax + orq %rsi, %rdi + movq -5744(%rbp), %rsi + salq $8, %rdi + orq %r9, %rax + orq %rcx, %rdi + movq -5776(%rbp), %rcx + movq %rax, -5744(%rbp) + salq $8, %rdi + movq -5880(%rbp), %rax + orq %rdx, %rdi + movzbl -288(%rbp,%r15), %edx + salq $8, %rdi + orq %rdx, %rdi + movq -5784(%rbp), %rdx + salq $8, %rdi + movzbl -288(%rbp,%rdx), %edx + orq %rdx, %rdi + movzbl -288(%rbp,%rcx), %edx + salq $8, %rdi + orq %rdx, %rdi + movzbl -288(%rbp,%rsi), %edx + salq $8, %rdi + orq %rdx, %rdi + movq %rdi, -5736(%rbp) + movdqa -5744(%rbp), %xmm6 + movups %xmm6, (%rax,%rbx) + addq %r8, -5712(%rbp) + movq -5712(%rbp), %rax + leaq 0(,%rax,4), %rbx +.L566: + movq -5768(%rbp), %rax + movq -5760(%rbp), %rdx + movl %ebx, %ecx + subq -5712(%rbp), %rdx + leaq (%rax,%rdx,4), %rax + cmpl $8, %ebx + jnb .L571 + testb $4, %bl + jne .L773 + testl %ecx, %ecx + jne .L774 +.L572: + movl %ebx, %ecx + cmpl $8, %ebx + jnb .L575 + andl $4, %ebx + jne .L775 + testl %ecx, %ecx + jne .L776 +.L576: + movq -5696(%rbp), %rbx + movq -5760(%rbp), %rdi + movq %rbx, %rax + subq -5768(%rbp), %rax + sarq $2, %rax + subq %rax, %rdi + movq %rax, -5976(%rbp) + movq %rdi, -5968(%rbp) + movq %rdx, %rdi + subq %rax, %rdi + movq %rdi, -5712(%rbp) + leaq (%rbx,%rdi,4), %rax + je .L616 + movdqu (%rbx), %xmm5 + movdqu 16(%rbx), %xmm7 + leaq -64(%rax), %rsi + addq $64, %rbx + movdqu -32(%rbx), %xmm6 + movq %rsi, -5784(%rbp) + movaps %xmm5, -6000(%rbp) + movdqu -16(%rbx), %xmm5 + movaps %xmm7, -6016(%rbp) + movdqu -64(%rax), %xmm7 + movaps %xmm6, -6032(%rbp) + movdqu -48(%rax), %xmm6 + movaps %xmm5, -6048(%rbp) + movdqu -32(%rax), %xmm5 + movaps %xmm7, -6064(%rbp) + movdqu -16(%rax), %xmm7 + movq %rbx, -5776(%rbp) + movaps %xmm6, -6080(%rbp) + movaps %xmm5, -6096(%rbp) + movaps %xmm7, -6112(%rbp) + cmpq %rsi, %rbx + je .L617 + leaq _ZZN3hwy6N_SSE26detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices(%rip), %rax + xorl %r15d, %r15d + movq %rax, -5744(%rbp) + jmp .L583 + .p2align 4,,10 + .p2align 3 +.L778: + movq -5784(%rbp), %rax + movdqu -64(%rax), %xmm4 + movdqu -48(%rax), %xmm3 + prefetcht0 -256(%rax) + subq $64, %rax + movdqu 32(%rax), %xmm2 + movdqu 48(%rax), %xmm1 + movq %rax, -5784(%rbp) +.L582: + movdqa %xmm4, %xmm0 + movq -5744(%rbp), %rbx + movaps %xmm1, -5872(%rbp) + pcmpgtd -5728(%rbp), %xmm0 + movaps %xmm4, -272(%rbp) + movaps %xmm2, -5856(%rbp) + movaps %xmm3, -5840(%rbp) + movmskps %xmm0, %r14d + movq %r14, %rax + salq $4, %rax + movdqa (%rbx,%rax), %xmm0 + movaps %xmm0, -2336(%rbp) + movzbl -2335(%rbp), %eax + movd %xmm0, %esi + movaps %xmm0, -2448(%rbp) + movzbl -2440(%rbp), 
%edx + andl $15, %esi + andl $15, %eax + movaps %xmm0, -2432(%rbp) + movaps %xmm0, -2464(%rbp) + movq %rdx, %rcx + movzbl -2455(%rbp), %edx + movq %rax, -5808(%rbp) + movzbl -2425(%rbp), %eax + andl $15, %ecx + movaps %xmm0, -2416(%rbp) + movzbl -2410(%rbp), %r13d + andl $15, %edx + andl $15, %eax + movaps %xmm0, -2352(%rbp) + movzbl -2350(%rbp), %r10d + movaps %xmm0, -2368(%rbp) + andl $15, %r13d + movzbl -2365(%rbp), %r11d + movaps %xmm0, -2384(%rbp) + movzbl -2380(%rbp), %ebx + andl $15, %r10d + movaps %xmm0, -2400(%rbp) + movzbl -2395(%rbp), %r12d + andl $15, %r11d + movaps %xmm0, -2480(%rbp) + andl $15, %ebx + movaps %xmm0, -2496(%rbp) + andl $15, %r12d + movq %rsi, -5760(%rbp) + movq %rdx, -5824(%rbp) + movzbl -2470(%rbp), %edx + movq %rcx, -5792(%rbp) + movzbl -2485(%rbp), %ecx + movaps %xmm0, -2512(%rbp) + movzbl -2500(%rbp), %esi + andl $15, %edx + movaps %xmm0, -2528(%rbp) + movzbl -2515(%rbp), %edi + andl $15, %ecx + movaps %xmm0, -2544(%rbp) + movzbl -2530(%rbp), %r8d + andl $15, %esi + movaps %xmm0, -2560(%rbp) + movzbl -272(%rbp,%rax), %eax + andl $15, %edi + movzbl -272(%rbp,%r13), %r13d + movzbl -272(%rbp,%r12), %r12d + andl $15, %r8d + movzbl -272(%rbp,%rbx), %ebx + salq $8, %rax + movzbl -2545(%rbp), %r9d + movzbl -272(%rbp,%r11), %r11d + orq %r13, %rax + movzbl -272(%rbp,%rdi), %edi + movzbl -272(%rbp,%r10), %r10d + salq $8, %rax + andl $15, %r9d + movzbl -272(%rbp,%r8), %r8d + movzbl -272(%rbp,%rsi), %esi + orq %r12, %rax + movzbl -272(%rbp,%rcx), %ecx + movzbl -272(%rbp,%r9), %r9d + salq $8, %rax + movzbl -272(%rbp,%rdx), %edx + orq %rbx, %rax + salq $8, %r9 + salq $8, %rax + orq %r8, %r9 + orq %r11, %rax + movq -5760(%rbp), %r11 + salq $8, %rax + orq %r10, %rax + movq -5808(%rbp), %r10 + salq $8, %rax + movzbl -272(%rbp,%r10), %r10d + orq %r10, %rax + movzbl -272(%rbp,%r11), %r10d + salq $8, %rax + salq $8, %r9 + orq %rdi, %r9 + orq %r10, %rax + movq %r14, %rdi + salq $8, %r9 + movq %rax, -5760(%rbp) + orq %rsi, %r9 + salq $8, %r9 + orq %rcx, %r9 + movq -5792(%rbp), %rcx + salq $8, %r9 + orq %rdx, %r9 + movq -5824(%rbp), %rdx + salq $8, %r9 + movzbl -272(%rbp,%rdx), %edx + orq %rdx, %r9 + movzbl -272(%rbp,%rcx), %edx + salq $8, %r9 + orq %rdx, %r9 + movq %r9, -5752(%rbp) + call __popcountdi2@PLT + movdqa -5840(%rbp), %xmm3 + movdqa -5760(%rbp), %xmm6 + movq -5696(%rbp), %rcx + movq -5712(%rbp), %rsi + cltq + movdqa %xmm3, %xmm0 + movq -5744(%rbp), %rbx + movaps %xmm3, -256(%rbp) + pcmpgtd -5728(%rbp), %xmm0 + movups %xmm6, (%rcx,%r15,4) + leaq -4(%rsi,%r15), %rdx + addq $4, %r15 + subq %rax, %r15 + movups %xmm6, (%rcx,%rdx,4) + movmskps %xmm0, %eax + movq %rax, -5808(%rbp) + salq $4, %rax + movdqa (%rbx,%rax), %xmm0 + movd %xmm0, %edx + movaps %xmm0, -2576(%rbp) + movzbl -2575(%rbp), %eax + andl $15, %edx + movaps %xmm0, -2688(%rbp) + movq %rdx, -5760(%rbp) + movzbl -2680(%rbp), %edx + movq %rax, %r11 + movaps %xmm0, -2672(%rbp) + movzbl -2665(%rbp), %eax + andl $15, %r11d + movaps %xmm0, -2656(%rbp) + movq %rdx, %r8 + movzbl -2650(%rbp), %r14d + andl $15, %r8d + andl $15, %eax + movaps %xmm0, -2592(%rbp) + movaps %xmm0, -2608(%rbp) + andl $15, %r14d + movzbl -2605(%rbp), %ebx + movaps %xmm0, -2624(%rbp) + movzbl -2620(%rbp), %r12d + movaps %xmm0, -2640(%rbp) + movzbl -2635(%rbp), %r13d + andl $15, %ebx + movaps %xmm0, -2704(%rbp) + andl $15, %r12d + movq %r11, -5792(%rbp) + movzbl -2590(%rbp), %r11d + andl $15, %r13d + movq %r8, -5824(%rbp) + movzbl -2695(%rbp), %edx + movaps %xmm0, -2720(%rbp) + movzbl -2710(%rbp), %ecx + andl $15, %r11d + movaps %xmm0, -2736(%rbp) 
+ movzbl -2725(%rbp), %esi + andl $15, %edx + movaps %xmm0, -2752(%rbp) + movzbl -2740(%rbp), %edi + andl $15, %ecx + movaps %xmm0, -2768(%rbp) + movzbl -2755(%rbp), %r8d + andl $15, %esi + movaps %xmm0, -2784(%rbp) + movzbl -2770(%rbp), %r9d + andl $15, %edi + movaps %xmm0, -2800(%rbp) + movzbl -2785(%rbp), %r10d + movzbl -256(%rbp,%rax), %eax + andl $15, %r8d + movzbl -256(%rbp,%r14), %r14d + andl $15, %r9d + movzbl -256(%rbp,%r13), %r13d + andl $15, %r10d + salq $8, %rax + movzbl -256(%rbp,%r9), %r9d + movzbl -256(%rbp,%r12), %r12d + orq %r14, %rax + movzbl -256(%rbp,%rbx), %ebx + movzbl -256(%rbp,%r10), %r10d + salq $8, %rax + movq -5760(%rbp), %r14 + movzbl -256(%rbp,%r8), %r8d + orq %r13, %rax + salq $8, %r10 + movzbl -256(%rbp,%rdi), %edi + movzbl -256(%rbp,%r11), %r11d + orq %r9, %r10 + salq $8, %rax + movzbl -256(%rbp,%rsi), %esi + movzbl -256(%rbp,%rcx), %ecx + orq %r12, %rax + salq $8, %r10 + movzbl -256(%rbp,%rdx), %edx + orq %r8, %r10 + salq $8, %rax + movq -5824(%rbp), %r8 + orq %rbx, %rax + salq $8, %r10 + orq %rdi, %r10 + salq $8, %rax + orq %r11, %rax + salq $8, %r10 + movq -5792(%rbp), %r11 + orq %rsi, %r10 + salq $8, %rax + movzbl -256(%rbp,%r11), %r11d + salq $8, %r10 + orq %rcx, %r10 + orq %r11, %rax + salq $8, %r10 + movzbl -256(%rbp,%r14), %r11d + salq $8, %rax + orq %rdx, %r10 + movzbl -256(%rbp,%r8), %edx + movq -5808(%rbp), %rdi + salq $8, %r10 + movq %rax, %r14 + orq %r11, %r14 + orq %rdx, %r10 + movq %r14, -5760(%rbp) + movq %r10, -5752(%rbp) + call __popcountdi2@PLT + movdqa -5856(%rbp), %xmm2 + movq -5712(%rbp), %rsi + movq -5696(%rbp), %rcx + cltq + movdqa -5760(%rbp), %xmm6 + leaq -8(%rsi,%r15), %rdx + movdqa %xmm2, %xmm0 + movq -5744(%rbp), %rdi + movaps %xmm2, -240(%rbp) + movups %xmm6, (%rcx,%r15,4) + pcmpgtd -5728(%rbp), %xmm0 + movups %xmm6, (%rcx,%rdx,4) + movl $4, %edx + subq %rax, %rdx + movq %rdx, %rbx + addq %r15, %rbx + movmskps %xmm0, %r15d + movq %r15, %rax + salq $4, %rax + movdqa (%rdi,%rax), %xmm0 + movaps %xmm0, -2816(%rbp) + movd %xmm0, %edx + movzbl -2815(%rbp), %eax + andl $15, %edx + movaps %xmm0, -2928(%rbp) + andl $15, %eax + movq %rdx, -5760(%rbp) + movzbl -2920(%rbp), %edx + movaps %xmm0, -2912(%rbp) + movq %rax, -5808(%rbp) + movzbl -2905(%rbp), %eax + movq %rdx, %r8 + movaps %xmm0, -2896(%rbp) + movzbl -2890(%rbp), %r14d + andl $15, %r8d + andl $15, %eax + movaps %xmm0, -2832(%rbp) + movzbl -2830(%rbp), %r10d + movaps %xmm0, -2848(%rbp) + andl $15, %r14d + movzbl -2845(%rbp), %r11d + movaps %xmm0, -2864(%rbp) + movzbl -2860(%rbp), %r12d + andl $15, %r10d + movaps %xmm0, -2880(%rbp) + movzbl -2875(%rbp), %r13d + andl $15, %r11d + movaps %xmm0, -2944(%rbp) + andl $15, %r12d + movq %r8, -5792(%rbp) + movzbl -2935(%rbp), %edx + andl $15, %r13d + movaps %xmm0, -2960(%rbp) + movaps %xmm0, -2976(%rbp) + movq %rdx, %r9 + movzbl -2965(%rbp), %ecx + movzbl -2950(%rbp), %edx + movaps %xmm0, -2992(%rbp) + andl $15, %r9d + movzbl -2980(%rbp), %esi + movaps %xmm0, -3008(%rbp) + andl $15, %edx + andl $15, %ecx + movzbl -2995(%rbp), %edi + movaps %xmm0, -3024(%rbp) + movzbl -3010(%rbp), %r8d + andl $15, %esi + movaps %xmm0, -3040(%rbp) + movzbl -240(%rbp,%rax), %eax + andl $15, %edi + movzbl -240(%rbp,%r14), %r14d + movzbl -240(%rbp,%r13), %r13d + movq %r9, -5824(%rbp) + andl $15, %r8d + salq $8, %rax + movzbl -3025(%rbp), %r9d + movzbl -240(%rbp,%r12), %r12d + orq %r14, %rax + movq -5760(%rbp), %r14 + movzbl -240(%rbp,%r11), %r11d + salq $8, %rax + andl $15, %r9d + movzbl -240(%rbp,%r10), %r10d + movzbl -240(%rbp,%r8), %r8d + orq %r13, %rax + 
movzbl -240(%rbp,%rdi), %edi + movzbl -240(%rbp,%r9), %r9d + salq $8, %rax + movzbl -240(%rbp,%rsi), %esi + movzbl -240(%rbp,%rcx), %ecx + orq %r12, %rax + salq $8, %r9 + movzbl -240(%rbp,%rdx), %edx + salq $8, %rax + orq %r8, %r9 + orq %r11, %rax + movq -5808(%rbp), %r11 + salq $8, %rax + orq %r10, %rax + movzbl -240(%rbp,%r11), %r10d + salq $8, %rax + orq %r10, %rax + movzbl -240(%rbp,%r14), %r10d + salq $8, %rax + salq $8, %r9 + orq %rdi, %r9 + orq %r10, %rax + movq %r15, %rdi + salq $8, %r9 + orq %rsi, %r9 + salq $8, %r9 + orq %rcx, %r9 + salq $8, %r9 + orq %rdx, %r9 + movq -5824(%rbp), %rdx + salq $8, %r9 + movzbl -240(%rbp,%rdx), %edx + movq -5792(%rbp), %r8 + movq %rax, -5760(%rbp) + orq %rdx, %r9 + movzbl -240(%rbp,%r8), %edx + salq $8, %r9 + orq %rdx, %r9 + movq %r9, -5752(%rbp) + call __popcountdi2@PLT + movdqa -5872(%rbp), %xmm1 + movq -5712(%rbp), %rsi + movq -5696(%rbp), %rcx + cltq + movdqa -5760(%rbp), %xmm6 + movdqa %xmm1, %xmm0 + leaq -12(%rsi,%rbx), %rdx + movq -5744(%rbp), %rdi + subq $16, %rsi + pcmpgtd -5728(%rbp), %xmm0 + movups %xmm6, (%rcx,%rbx,4) + movups %xmm6, (%rcx,%rdx,4) + movl $4, %edx + subq %rax, %rdx + movq %rsi, -5712(%rbp) + movmskps %xmm0, %r15d + addq %rdx, %rbx + movaps %xmm1, -224(%rbp) + movq %r15, %rax + salq $4, %rax + movdqa (%rdi,%rax), %xmm0 + movaps %xmm0, -3056(%rbp) + movzbl -3055(%rbp), %eax + movd %xmm0, %edi + movaps %xmm0, -3168(%rbp) + movzbl -3160(%rbp), %edx + andl $15, %edi + andl $15, %eax + movaps %xmm0, -3072(%rbp) + movzbl -3070(%rbp), %r10d + andl $15, %edx + movaps %xmm0, -3088(%rbp) + movzbl -3085(%rbp), %r11d + movaps %xmm0, -3104(%rbp) + movzbl -3100(%rbp), %r12d + andl $15, %r10d + movaps %xmm0, -3120(%rbp) + movzbl -3115(%rbp), %r13d + andl $15, %r11d + movaps %xmm0, -3136(%rbp) + movzbl -3130(%rbp), %r14d + andl $15, %r12d + movaps %xmm0, -3152(%rbp) + andl $15, %r13d + movq %rdi, -5760(%rbp) + andl $15, %r14d + movq %rax, -5808(%rbp) + movzbl -3145(%rbp), %eax + movq %rdx, -5792(%rbp) + movaps %xmm0, -3184(%rbp) + movzbl -3175(%rbp), %edx + andl $15, %eax + movaps %xmm0, -3200(%rbp) + movq %rdx, %r8 + movaps %xmm0, -3216(%rbp) + movzbl -3190(%rbp), %edx + movzbl -3205(%rbp), %ecx + andl $15, %r8d + movaps %xmm0, -3232(%rbp) + movzbl -3220(%rbp), %esi + movaps %xmm0, -3248(%rbp) + andl $15, %edx + andl $15, %ecx + movzbl -3235(%rbp), %edi + movaps %xmm0, -3264(%rbp) + andl $15, %esi + movaps %xmm0, -3280(%rbp) + movzbl -3265(%rbp), %r9d + andl $15, %edi + movzbl -224(%rbp,%rax), %eax + movq %r8, -5824(%rbp) + movzbl -3250(%rbp), %r8d + movzbl -224(%rbp,%rdi), %edi + movzbl -224(%rbp,%r14), %r14d + andl $15, %r8d + andl $15, %r9d + salq $8, %rax + movzbl -224(%rbp,%r13), %r13d + movzbl -224(%rbp,%r9), %r9d + orq %r14, %rax + movzbl -224(%rbp,%r8), %r8d + salq $8, %rax + movq -5760(%rbp), %r14 + movzbl -224(%rbp,%r12), %r12d + salq $8, %r9 + orq %r13, %rax + movzbl -224(%rbp,%rsi), %esi + movzbl -224(%rbp,%r11), %r11d + orq %r8, %r9 + salq $8, %rax + movzbl -224(%rbp,%rcx), %ecx + movzbl -224(%rbp,%r10), %r10d + salq $8, %r9 + orq %r12, %rax + movzbl -224(%rbp,%rdx), %edx + movq -5824(%rbp), %r8 + orq %rdi, %r9 + salq $8, %rax + movq %r15, %rdi + movl $4, %r15d + salq $8, %r9 + orq %r11, %rax + movq -5808(%rbp), %r11 + orq %rsi, %r9 + salq $8, %rax + salq $8, %r9 + orq %r10, %rax + movzbl -224(%rbp,%r11), %r10d + orq %rcx, %r9 + salq $8, %rax + salq $8, %r9 + orq %r10, %rax + movzbl -224(%rbp,%r14), %r10d + orq %rdx, %r9 + salq $8, %rax + movzbl -224(%rbp,%r8), %edx + salq $8, %r9 + orq %r10, %rax + orq %rdx, %r9 + movq 
-5792(%rbp), %rdx + movq %rax, -5760(%rbp) + salq $8, %r9 + movzbl -224(%rbp,%rdx), %edx + orq %rdx, %r9 + movq %r9, -5752(%rbp) + call __popcountdi2@PLT + movq -5712(%rbp), %rsi + movq -5696(%rbp), %rcx + movdqa -5760(%rbp), %xmm7 + cltq + leaq (%rbx,%rsi), %rdx + subq %rax, %r15 + movups %xmm7, (%rcx,%rbx,4) + addq %rbx, %r15 + movq -5784(%rbp), %rbx + movups %xmm7, (%rcx,%rdx,4) + cmpq %rbx, -5776(%rbp) + je .L777 +.L583: + movq -5776(%rbp), %rax + subq -5696(%rbp), %rax + sarq $2, %rax + subq %r15, %rax + cmpq $16, %rax + ja .L778 + movq -5776(%rbp), %rax + movdqu (%rax), %xmm4 + movdqu 16(%rax), %xmm3 + prefetcht0 256(%rax) + addq $64, %rax + movdqu -32(%rax), %xmm2 + movdqu -16(%rax), %xmm1 + movq %rax, -5776(%rbp) + jmp .L582 + .p2align 4,,10 + .p2align 3 +.L770: + movq -5896(%rbp), %rbx + movl $16, %edx + movq %r8, %r15 + subq %rax, %rdx + leaq (%rdi,%rdx,4), %r12 + leaq -16(%rax,%rbx), %r11 + jmp .L496 + .p2align 4,,10 + .p2align 3 +.L777: + leaq (%rsi,%r15), %r13 + leaq (%rcx,%r15,4), %r14 + leaq 4(%r15), %rbx +.L580: + movdqa -6000(%rbp), %xmm6 + movq -5744(%rbp), %rdi + movdqa %xmm6, %xmm0 + movaps %xmm6, -208(%rbp) + pcmpgtd -5728(%rbp), %xmm0 + movmskps %xmm0, %r12d + movq %r12, %rax + salq $4, %rax + movdqa (%rdi,%rax), %xmm0 + movaps %xmm0, -3296(%rbp) + movzbl -3295(%rbp), %eax + movd %xmm0, %r8d + movaps %xmm0, -3408(%rbp) + movzbl -3400(%rbp), %edx + andl $15, %r8d + andl $15, %eax + movaps %xmm0, -3312(%rbp) + andl $15, %edx + movq %rax, -5760(%rbp) + movzbl -3310(%rbp), %eax + movaps %xmm0, -3424(%rbp) + movq %rdx, -5808(%rbp) + movzbl -3415(%rbp), %edx + movq %rax, %rcx + movaps %xmm0, -3328(%rbp) + movzbl -3325(%rbp), %eax + andl $15, %ecx + movaps %xmm0, -3440(%rbp) + movq %rdx, %r15 + movzbl -3430(%rbp), %edx + movaps %xmm0, -3392(%rbp) + movq %rax, %rsi + andl $15, %r15d + movzbl -3385(%rbp), %eax + movaps %xmm0, -3376(%rbp) + movzbl -3370(%rbp), %r11d + andl $15, %esi + movq %rcx, -5776(%rbp) + movq %rdx, %rcx + andl $15, %eax + andl $15, %ecx + movaps %xmm0, -3344(%rbp) + andl $15, %r11d + movzbl -3340(%rbp), %r9d + movaps %xmm0, -3360(%rbp) + movzbl -3355(%rbp), %r10d + movaps %xmm0, -3456(%rbp) + andl $15, %r9d + movq %rsi, -5784(%rbp) + andl $15, %r10d + movq %r15, -5792(%rbp) + movq %rcx, -5824(%rbp) + movzbl -3445(%rbp), %edx + movaps %xmm0, -3472(%rbp) + movaps %xmm0, -3488(%rbp) + movq %rdx, %rsi + movzbl -3475(%rbp), %ecx + movzbl -3460(%rbp), %edx + movaps %xmm0, -3504(%rbp) + andl $15, %esi + movaps %xmm0, -3520(%rbp) + movq %rsi, %r15 + andl $15, %ecx + andl $15, %edx + movzbl -208(%rbp,%rax), %eax + movzbl -3505(%rbp), %edi + movzbl -208(%rbp,%r11), %r11d + movzbl -3490(%rbp), %esi + salq $8, %rax + andl $15, %edi + movzbl -208(%rbp,%r10), %r10d + movzbl -208(%rbp,%r9), %r9d + orq %r11, %rax + movq -5760(%rbp), %r11 + movzbl -208(%rbp,%rdi), %edi + andl $15, %esi + salq $8, %rax + movzbl -208(%rbp,%rsi), %esi + movzbl -208(%rbp,%rcx), %ecx + orq %r10, %rax + movq -5776(%rbp), %r10 + salq $8, %rdi + movzbl -208(%rbp,%rdx), %edx + movzbl -208(%rbp,%r8), %r8d + salq $8, %rax + orq %rsi, %rdi + orq %r9, %rax + movq -5784(%rbp), %r9 + salq $8, %rax + movzbl -208(%rbp,%r9), %r9d + orq %r9, %rax + movzbl -208(%rbp,%r10), %r9d + salq $8, %rax + orq %r9, %rax + movzbl -208(%rbp,%r11), %r9d + salq $8, %rax + orq %r9, %rax + salq $8, %rax + salq $8, %rdi + orq %rcx, %rdi + movq -5824(%rbp), %rcx + orq %r8, %rax + salq $8, %rdi + orq %rdx, %rdi + movzbl -208(%rbp,%r15), %edx + movq -5792(%rbp), %r15 + salq $8, %rdi + orq %rdx, %rdi + movzbl -208(%rbp,%rcx), %edx 
+ salq $8, %rdi + orq %rdx, %rdi + movzbl -208(%rbp,%r15), %edx + salq $8, %rdi + orq %rdx, %rdi + movq -5808(%rbp), %rdx + salq $8, %rdi + movzbl -208(%rbp,%rdx), %edx + movq %rax, -5760(%rbp) + movq %rdi, %rax + movq %r12, %rdi + orq %rdx, %rax + movq %rax, -5752(%rbp) + call __popcountdi2@PLT + movdqa -6016(%rbp), %xmm7 + movq -5744(%rbp), %rdi + cltq + movdqa -5760(%rbp), %xmm6 + movq -5696(%rbp), %rsi + movdqa %xmm7, %xmm0 + subq %rax, %rbx + movaps %xmm7, -192(%rbp) + pcmpgtd -5728(%rbp), %xmm0 + movups %xmm6, (%r14) + movups %xmm6, -16(%rsi,%r13,4) + movmskps %xmm0, %r11d + movq %r11, %rax + salq $4, %rax + movdqa (%rdi,%rax), %xmm0 + movaps %xmm0, -3648(%rbp) + movzbl -3640(%rbp), %edx + movd %xmm0, %r8d + movaps %xmm0, -3536(%rbp) + movzbl -3535(%rbp), %eax + andl $15, %r8d + movaps %xmm0, -3664(%rbp) + movq %rdx, %rcx + movzbl -3655(%rbp), %edx + movaps %xmm0, -3552(%rbp) + movq %rax, %r15 + andl $15, %ecx + movzbl -3550(%rbp), %eax + andl $15, %edx + movaps %xmm0, -3568(%rbp) + andl $15, %r15d + movaps %xmm0, -3680(%rbp) + movq %rax, %r14 + movzbl -3565(%rbp), %eax + movq %rdx, -5776(%rbp) + movzbl -3670(%rbp), %edx + andl $15, %r14d + movaps %xmm0, -3584(%rbp) + movq %rax, %r13 + movzbl -3580(%rbp), %eax + movq %rcx, -5760(%rbp) + movq %rdx, %rcx + andl $15, %r13d + andl $15, %ecx + movaps %xmm0, -3600(%rbp) + movq %rax, %r12 + movzbl -3595(%rbp), %r9d + movaps %xmm0, -3616(%rbp) + movzbl -3610(%rbp), %r10d + andl $15, %r12d + movaps %xmm0, -3632(%rbp) + movzbl -3625(%rbp), %eax + andl $15, %r9d + movq %rcx, -5784(%rbp) + andl $15, %r10d + movaps %xmm0, -3696(%rbp) + movzbl -3685(%rbp), %edx + andl $15, %eax + movaps %xmm0, -3712(%rbp) + andl $15, %edx + movaps %xmm0, -3728(%rbp) + movzbl -3715(%rbp), %ecx + movaps %xmm0, -3744(%rbp) + movzbl -3730(%rbp), %esi + movaps %xmm0, -3760(%rbp) + movzbl -3745(%rbp), %edi + movzbl -192(%rbp,%rax), %eax + andl $15, %ecx + movq %rdx, -5808(%rbp) + movzbl -3700(%rbp), %edx + andl $15, %esi + movzbl -192(%rbp,%r10), %r10d + andl $15, %edi + movzbl -192(%rbp,%r9), %r9d + andl $15, %edx + salq $8, %rax + movzbl -192(%rbp,%rdi), %edi + movzbl -192(%rbp,%rsi), %esi + orq %r10, %rax + movzbl -192(%rbp,%rcx), %ecx + movzbl -192(%rbp,%rdx), %edx + salq $8, %rax + salq $8, %rdi + movzbl -192(%rbp,%r8), %r8d + orq %r9, %rax + orq %rsi, %rdi + movzbl -192(%rbp,%r12), %r9d + salq $8, %rax + salq $8, %rdi + orq %r9, %rax + orq %rcx, %rdi + movq -5760(%rbp), %rcx + movzbl -192(%rbp,%r13), %r9d + salq $8, %rax + salq $8, %rdi + orq %r9, %rax + orq %rdx, %rdi + movzbl -192(%rbp,%r14), %r9d + movq -5808(%rbp), %r14 + salq $8, %rax + salq $8, %rdi + movzbl -192(%rbp,%r14), %edx + orq %r9, %rax + movzbl -192(%rbp,%r15), %r9d + movq -5784(%rbp), %r15 + salq $8, %rax + orq %rdx, %rdi + orq %r9, %rax + movzbl -192(%rbp,%r15), %edx + salq $8, %rdi + salq $8, %rax + orq %r8, %rax + orq %rdx, %rdi + movq -5776(%rbp), %rdx + salq $8, %rdi + movzbl -192(%rbp,%rdx), %edx + orq %rdx, %rdi + movzbl -192(%rbp,%rcx), %edx + movq %rax, -5760(%rbp) + salq $8, %rdi + movq %rdi, %rax + movq %r11, %rdi + orq %rdx, %rax + movq %rax, -5752(%rbp) + call __popcountdi2@PLT + movq -5712(%rbp), %rcx + movdqa -5760(%rbp), %xmm6 + movq -5696(%rbp), %rsi + cltq + movq -5744(%rbp), %rdi + leaq -8(%rcx,%rbx), %rdx + movups %xmm6, (%rsi,%rbx,4) + subq %rax, %rbx + movups %xmm6, (%rsi,%rdx,4) + movdqa -6032(%rbp), %xmm6 + addq $4, %rbx + movdqa %xmm6, %xmm0 + movaps %xmm6, -176(%rbp) + pcmpgtd -5728(%rbp), %xmm0 + movmskps %xmm0, %r11d + movq %r11, %rax + salq $4, %rax + movdqa 
(%rdi,%rax), %xmm0 + movaps %xmm0, -3776(%rbp) + movzbl -3775(%rbp), %eax + movd %xmm0, %r8d + movaps %xmm0, -3792(%rbp) + andl $15, %r8d + movq %rax, %r15 + movzbl -3790(%rbp), %eax + movaps %xmm0, -3888(%rbp) + movzbl -3880(%rbp), %edx + movaps %xmm0, -3808(%rbp) + andl $15, %r15d + movq %rax, %r14 + movzbl -3805(%rbp), %eax + andl $15, %edx + movaps %xmm0, -3824(%rbp) + movaps %xmm0, -3904(%rbp) + andl $15, %r14d + movq %rdx, -5760(%rbp) + movq %rax, %r13 + movzbl -3895(%rbp), %edx + movzbl -3820(%rbp), %eax + movaps %xmm0, -3872(%rbp) + andl $15, %r13d + movq %rdx, %rcx + movaps %xmm0, -3856(%rbp) + movzbl -3850(%rbp), %r10d + movq %rax, %r12 + movzbl -3865(%rbp), %eax + andl $15, %ecx + movaps %xmm0, -3840(%rbp) + movaps %xmm0, -3920(%rbp) + movzbl -3835(%rbp), %r9d + andl $15, %r10d + andl $15, %r12d + movq %rcx, -5776(%rbp) + andl $15, %eax + movzbl -3910(%rbp), %edx + movaps %xmm0, -3936(%rbp) + andl $15, %r9d + movaps %xmm0, -3952(%rbp) + andl $15, %edx + movaps %xmm0, -3968(%rbp) + movzbl -3955(%rbp), %ecx + movaps %xmm0, -3984(%rbp) + movzbl -3970(%rbp), %esi + movaps %xmm0, -4000(%rbp) + movzbl -176(%rbp,%rax), %eax + andl $15, %ecx + movzbl -176(%rbp,%r10), %r10d + movq %rdx, -5784(%rbp) + movzbl -3925(%rbp), %edx + andl $15, %esi + salq $8, %rax + movzbl -176(%rbp,%rsi), %esi + movzbl -176(%rbp,%r9), %r9d + orq %r10, %rax + movq %rdx, %rdi + movzbl -3940(%rbp), %edx + movzbl -176(%rbp,%rcx), %ecx + movzbl -176(%rbp,%r8), %r8d + salq $8, %rax + andl $15, %edi + orq %r9, %rax + movq %rdi, -5808(%rbp) + andl $15, %edx + movzbl -176(%rbp,%r12), %r9d + movzbl -3985(%rbp), %edi + salq $8, %rax + movq -5808(%rbp), %r10 + orq %r9, %rax + movzbl -176(%rbp,%rdx), %edx + movzbl -176(%rbp,%r13), %r9d + andl $15, %edi + salq $8, %rax + orq %r9, %rax + movzbl -176(%rbp,%rdi), %edi + movzbl -176(%rbp,%r14), %r9d + salq $8, %rax + movq -5784(%rbp), %r14 + orq %r9, %rax + salq $8, %rdi + movzbl -176(%rbp,%r15), %r9d + movq -5776(%rbp), %r15 + orq %rsi, %rdi + salq $8, %rax + orq %r9, %rax + salq $8, %rdi + salq $8, %rax + orq %rcx, %rdi + salq $8, %rdi + orq %r8, %rax + orq %rdx, %rdi + movzbl -176(%rbp,%r10), %edx + salq $8, %rdi + orq %rdx, %rdi + movzbl -176(%rbp,%r14), %edx + salq $8, %rdi + orq %rdx, %rdi + movzbl -176(%rbp,%r15), %edx + salq $8, %rdi + orq %rdx, %rdi + movq -5760(%rbp), %rdx + salq $8, %rdi + movzbl -176(%rbp,%rdx), %edx + movq %rax, -5760(%rbp) + movq %rdi, %rax + movq %r11, %rdi + orq %rdx, %rax + movq %rax, -5752(%rbp) + call __popcountdi2@PLT + movdqa -6048(%rbp), %xmm7 + movq -5712(%rbp), %rcx + movq -5696(%rbp), %rsi + cltq + movdqa -5760(%rbp), %xmm6 + movdqa %xmm7, %xmm0 + leaq -12(%rcx,%rbx), %rdx + movq -5744(%rbp), %rdi + movaps %xmm7, -160(%rbp) + pcmpgtd -5728(%rbp), %xmm0 + movups %xmm6, (%rsi,%rbx,4) + movups %xmm6, (%rsi,%rdx,4) + movl $4, %edx + subq %rax, %rdx + movmskps %xmm0, %r11d + addq %rdx, %rbx + movq %r11, %rax + salq $4, %rax + movdqa (%rdi,%rax), %xmm0 + movaps %xmm0, -4016(%rbp) + movzbl -4015(%rbp), %eax + movd %xmm0, %r8d + movaps %xmm0, -4032(%rbp) + andl $15, %r8d + movaps %xmm0, -4128(%rbp) + movq %rax, %r15 + movzbl -4120(%rbp), %edx + movzbl -4030(%rbp), %eax + movaps %xmm0, -4048(%rbp) + andl $15, %r15d + andl $15, %edx + movq %rax, %r14 + movzbl -4045(%rbp), %eax + movaps %xmm0, -4144(%rbp) + movq %rdx, -5760(%rbp) + movzbl -4135(%rbp), %edx + andl $15, %r14d + movaps %xmm0, -4064(%rbp) + movq %rax, %r13 + movzbl -4060(%rbp), %eax + andl $15, %edx + movaps %xmm0, -4080(%rbp) + andl $15, %r13d + movzbl -4075(%rbp), %r9d + movaps 
%xmm0, -4096(%rbp) + movq %rax, %r12 + movzbl -4090(%rbp), %r10d + movaps %xmm0, -4112(%rbp) + movzbl -4105(%rbp), %eax + andl $15, %r12d + andl $15, %r9d + movq %rdx, -5776(%rbp) + andl $15, %r10d + movaps %xmm0, -4160(%rbp) + movzbl -4150(%rbp), %edx + andl $15, %eax + movaps %xmm0, -4176(%rbp) + movq %rdx, %rcx + movzbl -4165(%rbp), %edx + movaps %xmm0, -4240(%rbp) + andl $15, %ecx + movaps %xmm0, -4192(%rbp) + movq %rdx, %rdi + movaps %xmm0, -4208(%rbp) + movzbl -4180(%rbp), %edx + andl $15, %edi + movaps %xmm0, -4224(%rbp) + movzbl -4210(%rbp), %esi + movzbl -160(%rbp,%rax), %eax + movq %rdi, -5808(%rbp) + movzbl -4225(%rbp), %edi + andl $15, %edx + movq %rcx, -5784(%rbp) + movzbl -4195(%rbp), %ecx + andl $15, %esi + movzbl -160(%rbp,%r10), %r10d + andl $15, %edi + movzbl -160(%rbp,%rsi), %esi + movzbl -160(%rbp,%rdi), %edi + andl $15, %ecx + salq $8, %rax + movzbl -160(%rbp,%r9), %r9d + orq %r10, %rax + movq -5808(%rbp), %r10 + movzbl -160(%rbp,%rcx), %ecx + salq $8, %rax + salq $8, %rdi + movzbl -160(%rbp,%rdx), %edx + movzbl -160(%rbp,%r8), %r8d + orq %rsi, %rdi + orq %r9, %rax + movzbl -160(%rbp,%r12), %r9d + salq $8, %rax + salq $8, %rdi + orq %rcx, %rdi + orq %r9, %rax + movzbl -160(%rbp,%r13), %r9d + salq $8, %rax + salq $8, %rdi + orq %r9, %rax + orq %rdx, %rdi + movzbl -160(%rbp,%r14), %r9d + movq -5784(%rbp), %r14 + movzbl -160(%rbp,%r10), %edx + salq $8, %rax + salq $8, %rdi + orq %r9, %rax + movzbl -160(%rbp,%r15), %r9d + movq -5776(%rbp), %r15 + orq %rdx, %rdi + salq $8, %rax + movzbl -160(%rbp,%r14), %edx + salq $8, %rdi + orq %r9, %rax + orq %rdx, %rdi + salq $8, %rax + movzbl -160(%rbp,%r15), %edx + salq $8, %rdi + orq %r8, %rax + orq %rdx, %rdi + movq -5760(%rbp), %rdx + salq $8, %rdi + movzbl -160(%rbp,%rdx), %edx + movq %rax, -5760(%rbp) + movq %rdi, %rax + movq %r11, %rdi + orq %rdx, %rax + movq %rax, -5752(%rbp) + call __popcountdi2@PLT + movq -5712(%rbp), %rcx + movdqa -5760(%rbp), %xmm6 + movq -5696(%rbp), %rsi + cltq + movq -5744(%rbp), %rdi + leaq -16(%rcx,%rbx), %rdx + movups %xmm6, (%rsi,%rbx,4) + movups %xmm6, (%rsi,%rdx,4) + movdqa -6064(%rbp), %xmm6 + movl $4, %edx + subq %rax, %rdx + movdqa %xmm6, %xmm0 + addq %rdx, %rbx + movaps %xmm6, -144(%rbp) + pcmpgtd -5728(%rbp), %xmm0 + movmskps %xmm0, %r11d + movq %r11, %rax + salq $4, %rax + movdqa (%rdi,%rax), %xmm0 + movaps %xmm0, -4256(%rbp) + movzbl -4255(%rbp), %eax + movd %xmm0, %r8d + movaps %xmm0, -4272(%rbp) + andl $15, %r8d + movq %rax, %r15 + movzbl -4270(%rbp), %eax + movaps %xmm0, -4288(%rbp) + movaps %xmm0, -4368(%rbp) + movzbl -4360(%rbp), %edx + andl $15, %r15d + movq %rax, %r14 + movzbl -4285(%rbp), %eax + movaps %xmm0, -4304(%rbp) + andl $15, %edx + movaps %xmm0, -4352(%rbp) + andl $15, %r14d + movq %rax, %r13 + movzbl -4300(%rbp), %eax + movaps %xmm0, -4384(%rbp) + movq %rdx, -5760(%rbp) + movzbl -4375(%rbp), %edx + andl $15, %r13d + movq %rax, %r12 + movzbl -4345(%rbp), %eax + movaps %xmm0, -4336(%rbp) + movzbl -4330(%rbp), %r10d + andl $15, %edx + movaps %xmm0, -4320(%rbp) + andl $15, %r12d + movzbl -4315(%rbp), %r9d + andl $15, %eax + movq %rdx, -5776(%rbp) + andl $15, %r10d + movaps %xmm0, -4400(%rbp) + movzbl -4390(%rbp), %edx + andl $15, %r9d + movaps %xmm0, -4416(%rbp) + movaps %xmm0, -4432(%rbp) + movq %rdx, %rcx + movzbl -4405(%rbp), %edx + movaps %xmm0, -4448(%rbp) + andl $15, %ecx + movaps %xmm0, -4464(%rbp) + movq %rdx, %rdi + movzbl -4450(%rbp), %esi + movzbl -4420(%rbp), %edx + movaps %xmm0, -4480(%rbp) + movzbl -144(%rbp,%rax), %eax + andl $15, %edi + movzbl -144(%rbp,%r10), 
%r10d + movzbl -144(%rbp,%r9), %r9d + movq %rdi, -5808(%rbp) + andl $15, %esi + andl $15, %edx + salq $8, %rax + movzbl -4465(%rbp), %edi + movq %rcx, -5784(%rbp) + orq %r10, %rax + movzbl -4435(%rbp), %ecx + movzbl -144(%rbp,%rsi), %esi + salq $8, %rax + andl $15, %edi + movq -5808(%rbp), %r10 + movzbl -144(%rbp,%rdx), %edx + orq %r9, %rax + andl $15, %ecx + movzbl -144(%rbp,%r12), %r9d + movzbl -144(%rbp,%rdi), %edi + salq $8, %rax + movzbl -144(%rbp,%rcx), %ecx + movzbl -144(%rbp,%r8), %r8d + orq %r9, %rax + salq $8, %rdi + movzbl -144(%rbp,%r13), %r9d + salq $8, %rax + orq %rsi, %rdi + orq %r9, %rax + salq $8, %rdi + movzbl -144(%rbp,%r14), %r9d + movq -5784(%rbp), %r14 + salq $8, %rax + orq %r9, %rax + movzbl -144(%rbp,%r15), %r9d + movq -5776(%rbp), %r15 + salq $8, %rax + orq %r9, %rax + salq $8, %rax + orq %rcx, %rdi + salq $8, %rdi + orq %r8, %rax + orq %rdx, %rdi + movzbl -144(%rbp,%r10), %edx + salq $8, %rdi + orq %rdx, %rdi + movzbl -144(%rbp,%r14), %edx + salq $8, %rdi + orq %rdx, %rdi + movzbl -144(%rbp,%r15), %edx + salq $8, %rdi + orq %rdx, %rdi + movq -5760(%rbp), %rdx + salq $8, %rdi + movzbl -144(%rbp,%rdx), %edx + movq %rax, -5760(%rbp) + movq %rdi, %rax + movq %r11, %rdi + orq %rdx, %rax + movq %rax, -5752(%rbp) + call __popcountdi2@PLT + movdqa -6080(%rbp), %xmm7 + movq -5712(%rbp), %rcx + movq -5696(%rbp), %rsi + cltq + movdqa -5760(%rbp), %xmm6 + movdqa %xmm7, %xmm0 + leaq -20(%rcx,%rbx), %rdx + movq -5744(%rbp), %rdi + movaps %xmm7, -128(%rbp) + pcmpgtd -5728(%rbp), %xmm0 + movups %xmm6, (%rsi,%rbx,4) + movups %xmm6, (%rsi,%rdx,4) + movl $4, %edx + subq %rax, %rdx + movmskps %xmm0, %r11d + addq %rdx, %rbx + movq %r11, %rax + salq $4, %rax + movdqa (%rdi,%rax), %xmm0 + movaps %xmm0, -4496(%rbp) + movzbl -4495(%rbp), %eax + movd %xmm0, %r8d + movaps %xmm0, -4512(%rbp) + andl $15, %r8d + movaps %xmm0, -4608(%rbp) + movq %rax, %r15 + movzbl -4600(%rbp), %edx + movzbl -4510(%rbp), %eax + movaps %xmm0, -4528(%rbp) + andl $15, %r15d + andl $15, %edx + movq %rax, %r14 + movzbl -4525(%rbp), %eax + movaps %xmm0, -4624(%rbp) + movq %rdx, -5760(%rbp) + movzbl -4615(%rbp), %edx + andl $15, %r14d + movaps %xmm0, -4544(%rbp) + movq %rax, %r13 + movzbl -4540(%rbp), %eax + andl $15, %edx + movaps %xmm0, -4560(%rbp) + andl $15, %r13d + movzbl -4555(%rbp), %r9d + movaps %xmm0, -4576(%rbp) + movq %rax, %r12 + movzbl -4570(%rbp), %r10d + movaps %xmm0, -4592(%rbp) + movzbl -4585(%rbp), %eax + andl $15, %r12d + andl $15, %r9d + movq %rdx, -5776(%rbp) + andl $15, %r10d + movaps %xmm0, -4640(%rbp) + movzbl -4630(%rbp), %edx + andl $15, %eax + movaps %xmm0, -4656(%rbp) + movq %rdx, %rcx + movzbl -4645(%rbp), %edx + movaps %xmm0, -4720(%rbp) + andl $15, %ecx + movaps %xmm0, -4672(%rbp) + movq %rdx, %rdi + movaps %xmm0, -4688(%rbp) + movzbl -4660(%rbp), %edx + andl $15, %edi + movaps %xmm0, -4704(%rbp) + movzbl -4690(%rbp), %esi + movzbl -128(%rbp,%rax), %eax + movq %rdi, -5808(%rbp) + movzbl -4705(%rbp), %edi + andl $15, %edx + movq %rcx, -5784(%rbp) + movzbl -4675(%rbp), %ecx + andl $15, %esi + movzbl -128(%rbp,%r10), %r10d + andl $15, %edi + movzbl -128(%rbp,%rsi), %esi + movzbl -128(%rbp,%rdi), %edi + andl $15, %ecx + salq $8, %rax + movzbl -128(%rbp,%r9), %r9d + orq %r10, %rax + movzbl -128(%rbp,%rcx), %ecx + movq -5808(%rbp), %r10 + salq $8, %rax + salq $8, %rdi + movzbl -128(%rbp,%rdx), %edx + movzbl -128(%rbp,%r8), %r8d + orq %rsi, %rdi + orq %r9, %rax + movzbl -128(%rbp,%r12), %r9d + salq $8, %rax + salq $8, %rdi + orq %rcx, %rdi + orq %r9, %rax + movzbl -128(%rbp,%r13), %r9d + salq 
$8, %rax + salq $8, %rdi + orq %r9, %rax + orq %rdx, %rdi + movzbl -128(%rbp,%r14), %r9d + movzbl -128(%rbp,%r10), %edx + movq -5784(%rbp), %r14 + salq $8, %rax + salq $8, %rdi + orq %r9, %rax + orq %rdx, %rdi + movzbl -128(%rbp,%r15), %r9d + movq -5776(%rbp), %r15 + movzbl -128(%rbp,%r14), %edx + salq $8, %rdi + salq $8, %rax + orq %r9, %rax + orq %rdx, %rdi + movzbl -128(%rbp,%r15), %edx + salq $8, %rax + salq $8, %rdi + orq %r8, %rax + orq %rdx, %rdi + movq -5760(%rbp), %rdx + salq $8, %rdi + movzbl -128(%rbp,%rdx), %edx + movq %rax, -5760(%rbp) + movq %rdi, %rax + movq %r11, %rdi + orq %rdx, %rax + movq %rax, -5752(%rbp) + call __popcountdi2@PLT + movq -5712(%rbp), %rcx + movdqa -5760(%rbp), %xmm6 + movq -5696(%rbp), %rsi + cltq + movq -5744(%rbp), %rdi + leaq -24(%rcx,%rbx), %rdx + movups %xmm6, (%rsi,%rbx,4) + movups %xmm6, (%rsi,%rdx,4) + movdqa -6096(%rbp), %xmm6 + movl $4, %edx + subq %rax, %rdx + movdqa %xmm6, %xmm0 + addq %rdx, %rbx + movaps %xmm6, -112(%rbp) + pcmpgtd -5728(%rbp), %xmm0 + movmskps %xmm0, %r11d + movq %r11, %rax + salq $4, %rax + movdqa (%rdi,%rax), %xmm0 + movaps %xmm0, -4736(%rbp) + movzbl -4735(%rbp), %eax + movd %xmm0, %r8d + movaps %xmm0, -4752(%rbp) + andl $15, %r8d + movq %rax, %r15 + movzbl -4750(%rbp), %eax + movaps %xmm0, -4768(%rbp) + movaps %xmm0, -4848(%rbp) + movzbl -4840(%rbp), %edx + andl $15, %r15d + movq %rax, %r14 + movzbl -4765(%rbp), %eax + movaps %xmm0, -4784(%rbp) + andl $15, %edx + movaps %xmm0, -4832(%rbp) + andl $15, %r14d + movq %rax, %r13 + movzbl -4780(%rbp), %eax + movaps %xmm0, -4864(%rbp) + movq %rdx, -5760(%rbp) + movzbl -4855(%rbp), %edx + andl $15, %r13d + movq %rax, %r12 + movzbl -4825(%rbp), %eax + movaps %xmm0, -4816(%rbp) + movzbl -4810(%rbp), %r10d + andl $15, %edx + movaps %xmm0, -4800(%rbp) + andl $15, %r12d + movzbl -4795(%rbp), %r9d + andl $15, %eax + movq %rdx, -5776(%rbp) + andl $15, %r10d + movaps %xmm0, -4880(%rbp) + movzbl -4870(%rbp), %edx + andl $15, %r9d + movaps %xmm0, -4896(%rbp) + movaps %xmm0, -4912(%rbp) + movq %rdx, %rcx + movzbl -4885(%rbp), %edx + movaps %xmm0, -4928(%rbp) + andl $15, %ecx + movaps %xmm0, -4944(%rbp) + movq %rdx, %rdi + movzbl -4930(%rbp), %esi + movzbl -4900(%rbp), %edx + movaps %xmm0, -4960(%rbp) + movzbl -112(%rbp,%rax), %eax + movzbl -112(%rbp,%r10), %r10d + andl $15, %edi + movzbl -112(%rbp,%r9), %r9d + movq %rdi, -5808(%rbp) + andl $15, %esi + andl $15, %edx + salq $8, %rax + movzbl -112(%rbp,%rsi), %esi + movzbl -112(%rbp,%rdx), %edx + movq %rcx, -5784(%rbp) + orq %r10, %rax + movzbl -4945(%rbp), %edi + movzbl -4915(%rbp), %ecx + salq $8, %rax + movq -5808(%rbp), %r10 + movzbl -112(%rbp,%r8), %r8d + orq %r9, %rax + movzbl -112(%rbp,%r12), %r9d + andl $15, %edi + andl $15, %ecx + salq $8, %rax + movzbl -112(%rbp,%rdi), %edi + movzbl -112(%rbp,%rcx), %ecx + orq %r9, %rax + movzbl -112(%rbp,%r13), %r9d + salq $8, %rax + salq $8, %rdi + orq %r9, %rax + movzbl -112(%rbp,%r14), %r9d + orq %rsi, %rdi + movq -5784(%rbp), %r14 + salq $8, %rax + salq $8, %rdi + orq %r9, %rax + movzbl -112(%rbp,%r15), %r9d + movq -5776(%rbp), %r15 + salq $8, %rax + orq %r9, %rax + salq $8, %rax + orq %rcx, %rdi + salq $8, %rdi + orq %r8, %rax + orq %rdx, %rdi + movzbl -112(%rbp,%r10), %edx + salq $8, %rdi + orq %rdx, %rdi + movzbl -112(%rbp,%r14), %edx + salq $8, %rdi + orq %rdx, %rdi + movzbl -112(%rbp,%r15), %edx + salq $8, %rdi + orq %rdx, %rdi + movq -5760(%rbp), %rdx + salq $8, %rdi + movzbl -112(%rbp,%rdx), %edx + movq %rax, -5760(%rbp) + movq %rdi, %rax + movq %r11, %rdi + orq %rdx, %rax + movq %rax, 
-5752(%rbp) + call __popcountdi2@PLT + movdqa -6112(%rbp), %xmm7 + movq -5712(%rbp), %rcx + movq -5696(%rbp), %rsi + cltq + movdqa -5760(%rbp), %xmm6 + movdqa %xmm7, %xmm0 + leaq -28(%rcx,%rbx), %rdx + movq -5744(%rbp), %rdi + movaps %xmm7, -96(%rbp) + pcmpgtd -5728(%rbp), %xmm0 + movups %xmm6, (%rsi,%rbx,4) + movups %xmm6, (%rsi,%rdx,4) + movl $4, %edx + subq %rax, %rdx + movmskps %xmm0, %r11d + addq %rdx, %rbx + movq %r11, %rax + salq $4, %rax + movdqa (%rdi,%rax), %xmm0 + movaps %xmm0, -4976(%rbp) + movzbl -4975(%rbp), %eax + movd %xmm0, %r8d + movaps %xmm0, -5088(%rbp) + movzbl -5080(%rbp), %edx + andl $15, %r8d + movaps %xmm0, -4992(%rbp) + movq %rax, %r15 + movzbl -4990(%rbp), %eax + movaps %xmm0, -5104(%rbp) + movq %rdx, %rdi + andl $15, %r15d + movzbl -5095(%rbp), %edx + andl $15, %edi + movaps %xmm0, -5008(%rbp) + movq %rax, %r14 + movzbl -5005(%rbp), %eax + movq %rdi, -5744(%rbp) + movq %rdx, %rdi + andl $15, %r14d + andl $15, %edi + movaps %xmm0, -5024(%rbp) + movq %rax, %r13 + movzbl -5020(%rbp), %r9d + movaps %xmm0, -5040(%rbp) + movzbl -5035(%rbp), %r10d + andl $15, %r13d + movaps %xmm0, -5056(%rbp) + movzbl -5050(%rbp), %r12d + andl $15, %r9d + movaps %xmm0, -5072(%rbp) + movzbl -5065(%rbp), %eax + andl $15, %r10d + movq %rdi, -5760(%rbp) + andl $15, %r12d + movaps %xmm0, -5120(%rbp) + movzbl -5110(%rbp), %edx + andl $15, %eax + movaps %xmm0, -5136(%rbp) + andl $15, %edx + movaps %xmm0, -5200(%rbp) + movq %rdx, -5776(%rbp) + movzbl -5125(%rbp), %edx + movaps %xmm0, -5152(%rbp) + movq %rdx, %rdi + movaps %xmm0, -5168(%rbp) + movzbl -5140(%rbp), %edx + movzbl -5155(%rbp), %ecx + andl $15, %edi + movaps %xmm0, -5184(%rbp) + movzbl -96(%rbp,%rax), %eax + movzbl -5170(%rbp), %esi + movq %rdi, -5784(%rbp) + movzbl -5185(%rbp), %edi + andl $15, %edx + andl $15, %ecx + movzbl -96(%rbp,%r12), %r12d + andl $15, %esi + movzbl -96(%rbp,%rcx), %ecx + andl $15, %edi + salq $8, %rax + movzbl -96(%rbp,%rsi), %esi + movzbl -96(%rbp,%r10), %r10d + movzbl -96(%rbp,%rdi), %edi + orq %r12, %rax + movzbl -96(%rbp,%rdx), %edx + salq $8, %rax + movzbl -96(%rbp,%r9), %r9d + movzbl -96(%rbp,%r8), %r8d + salq $8, %rdi + orq %r10, %rax + movq -5784(%rbp), %r10 + orq %rsi, %rdi + salq $8, %rax + salq $8, %rdi + orq %r9, %rax + movzbl -96(%rbp,%r13), %r9d + orq %rcx, %rdi + salq $8, %rax + salq $8, %rdi + orq %r9, %rax + movzbl -96(%rbp,%r14), %r9d + movq -5776(%rbp), %r14 + orq %rdx, %rdi + movzbl -96(%rbp,%r10), %edx + salq $8, %rax + salq $8, %rdi + orq %r9, %rax + movzbl -96(%rbp,%r15), %r9d + movq -5760(%rbp), %r15 + orq %rdx, %rdi + movzbl -96(%rbp,%r14), %edx + salq $8, %rax + salq $8, %rdi + orq %r9, %rax + orq %rdx, %rdi + movzbl -96(%rbp,%r15), %edx + salq $8, %rax + salq $8, %rdi + orq %r8, %rax + orq %rdx, %rdi + movq -5744(%rbp), %rdx + salq $8, %rdi + movzbl -96(%rbp,%rdx), %edx + movq %rax, -5744(%rbp) + orq %rdx, %rdi + movq %rdi, -5736(%rbp) + movq %r11, %rdi + call __popcountdi2@PLT + movq -5712(%rbp), %rcx + movq -5696(%rbp), %rsi + movdqa -5744(%rbp), %xmm5 + cltq + leaq -32(%rcx,%rbx), %rdx + movq %rsi, %rcx + movups %xmm5, (%rsi,%rbx,4) + movups %xmm5, (%rsi,%rdx,4) + movl $4, %edx + subq %rax, %rdx + leaq (%rdx,%rbx), %rax + movq -5968(%rbp), %rdx + movq %rax, -5712(%rbp) + leaq 0(,%rax,4), %rbx + subq %rax, %rdx +.L579: + movq -5968(%rbp), %rsi + movdqa -5936(%rbp), %xmm2 + cmpq $4, %rdx + pcmpeqd %xmm0, %xmm0 + pcmpgtd -5728(%rbp), %xmm2 + leaq -16(,%rsi,4), %rax + cmovnb %rbx, %rax + pxor %xmm2, %xmm0 + movdqu (%rcx,%rax), %xmm7 + movaps %xmm2, -5808(%rbp) + movmskps %xmm0, 
%r12d + movups %xmm7, (%rcx,%rsi,4) + movq %r12, %rdi + salq $4, %r12 + movaps %xmm7, -5744(%rbp) + call __popcountdi2@PLT + movdqa -5936(%rbp), %xmm7 + movslq %eax, %rsi + movl %esi, -5760(%rbp) + movq %rsi, -5744(%rbp) + movq -5888(%rbp), %rsi + movaps %xmm7, -80(%rbp) + movdqa (%rsi,%r12), %xmm0 + movaps %xmm0, -5216(%rbp) + movzbl -5215(%rbp), %eax + movd %xmm0, %r8d + movaps %xmm0, -5232(%rbp) + andl $15, %r8d + movaps %xmm0, -5328(%rbp) + movq %rax, %r14 + movzbl -5320(%rbp), %edx + movzbl -5230(%rbp), %eax + movaps %xmm0, -5312(%rbp) + andl $15, %r14d + movaps %xmm0, -5344(%rbp) + movq %rax, %r13 + movq %rdx, %rsi + movzbl -5305(%rbp), %eax + movzbl -5335(%rbp), %edx + andl $15, %esi + movaps %xmm0, -5296(%rbp) + movzbl -5290(%rbp), %r12d + andl $15, %eax + movaps %xmm0, -5248(%rbp) + movzbl -5245(%rbp), %r9d + andl $15, %r13d + andl $15, %edx + movaps %xmm0, -5264(%rbp) + andl $15, %r12d + movzbl -5260(%rbp), %r10d + movaps %xmm0, -5280(%rbp) + movzbl -5275(%rbp), %r11d + andl $15, %r9d + movq %rsi, -5728(%rbp) + andl $15, %r10d + movaps %xmm0, -5360(%rbp) + andl $15, %r11d + movq %rdx, -5776(%rbp) + movzbl -5350(%rbp), %edx + movaps %xmm0, -5376(%rbp) + movaps %xmm0, -5392(%rbp) + movq %rdx, %r15 + movzbl -5365(%rbp), %edx + movaps %xmm0, -5408(%rbp) + andl $15, %r15d + movaps %xmm0, -5424(%rbp) + movq %rdx, %rcx + movzbl -5410(%rbp), %esi + movzbl -5380(%rbp), %edx + movaps %xmm0, -5440(%rbp) + movzbl -80(%rbp,%rax), %eax + andl $15, %ecx + movzbl -80(%rbp,%r12), %r12d + movzbl -80(%rbp,%r11), %r11d + movzbl -80(%rbp,%r10), %r10d + movq %r15, -5784(%rbp) + andl $15, %esi + salq $8, %rax + movzbl -80(%rbp,%r9), %r9d + movq %rcx, %r15 + andl $15, %edx + orq %r12, %rax + movzbl -80(%rbp,%rsi), %esi + movzbl -80(%rbp,%rdx), %edx + salq $8, %rax + movzbl -5425(%rbp), %edi + movzbl -5395(%rbp), %ecx + orq %r11, %rax + movzbl -80(%rbp,%r8), %r8d + salq $8, %rax + andl $15, %edi + andl $15, %ecx + orq %r10, %rax + movzbl -80(%rbp,%rdi), %edi + movzbl -80(%rbp,%rcx), %ecx + salq $8, %rax + orq %r9, %rax + movzbl -80(%rbp,%r13), %r9d + salq $8, %rdi + salq $8, %rax + orq %r9, %rax + movzbl -80(%rbp,%r14), %r9d + salq $8, %rax + orq %r9, %rax + salq $8, %rax + orq %rsi, %rdi + movq -5728(%rbp), %rsi + salq $8, %rdi + orq %rcx, %rdi + salq $8, %rdi + orq %rdx, %rdi + movzbl -80(%rbp,%r15), %edx + movq -5784(%rbp), %r15 + salq $8, %rdi + orq %rdx, %rdi + movzbl -80(%rbp,%r15), %edx + salq $8, %rdi + orq %rdx, %rdi + movq -5776(%rbp), %rdx + salq $8, %rdi + movzbl -80(%rbp,%rdx), %edx + orq %rdx, %rdi + movzbl -80(%rbp,%rsi), %edx + movq %rax, %rsi + orq %r8, %rsi + salq $8, %rdi + movq %rsi, -5728(%rbp) + movd -5760(%rbp), %xmm6 + movq %rdi, %rsi + orq %rdx, %rsi + movdqa -5808(%rbp), %xmm2 + pshufd $0, %xmm6, %xmm0 + movq %rsi, -5720(%rbp) + pcmpgtd -5920(%rbp), %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L585 + movq -5696(%rbp), %rax + movdqa -5728(%rbp), %xmm6 + movd %xmm6, (%rax,%rbx) +.L585: + pshufd $85, %xmm0, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L586 + pshufd $85, -5728(%rbp), %xmm1 + movq -5696(%rbp), %rax + movd %xmm1, 4(%rax,%rbx) +.L586: + movdqa %xmm0, %xmm1 + punpckhdq %xmm0, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L587 + movdqa -5728(%rbp), %xmm1 + movq -5696(%rbp), %rax + punpckhdq %xmm1, %xmm1 + movd %xmm1, 8(%rax,%rbx) +.L587: + pshufd $255, %xmm0, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L588 + pshufd $255, -5728(%rbp), %xmm0 + movq -5696(%rbp), %rax + movd %xmm0, 12(%rax,%rbx) +.L588: + movmskps %xmm2, %r13d + movq -5712(%rbp), %rbx + 
addq -5744(%rbp), %rbx + movq %r13, %rdi + salq $4, %r13 + leaq 0(,%rbx,4), %r12 + call __popcountdi2@PLT + movq -5888(%rbp), %rsi + movdqa -5936(%rbp), %xmm7 + movl %eax, -5776(%rbp) + movdqa (%rsi,%r13), %xmm0 + movaps %xmm7, -64(%rbp) + movaps %xmm0, -5456(%rbp) + movzbl -5455(%rbp), %eax + movd %xmm0, %r8d + movaps %xmm0, -5568(%rbp) + movzbl -5560(%rbp), %edx + andl $15, %r8d + movaps %xmm0, -5472(%rbp) + movq %rax, %r15 + movzbl -5470(%rbp), %eax + movaps %xmm0, -5584(%rbp) + movq %rdx, %rcx + andl $15, %r15d + movzbl -5575(%rbp), %edx + movaps %xmm0, -5552(%rbp) + movq %rax, %r14 + andl $15, %ecx + movzbl -5545(%rbp), %eax + movaps %xmm0, -5600(%rbp) + movq %rdx, %rdi + andl $15, %r14d + movzbl -5590(%rbp), %edx + movaps %xmm0, -5536(%rbp) + andl $15, %edi + andl $15, %eax + movzbl -5530(%rbp), %r13d + andl $15, %edx + movaps %xmm0, -5488(%rbp) + movzbl -5485(%rbp), %r9d + movaps %xmm0, -5504(%rbp) + andl $15, %r13d + movzbl -5500(%rbp), %r10d + movaps %xmm0, -5520(%rbp) + movzbl -5515(%rbp), %r11d + andl $15, %r9d + movq %rcx, -5712(%rbp) + andl $15, %r10d + movq %rdi, -5728(%rbp) + andl $15, %r11d + movaps %xmm0, -5616(%rbp) + movq %rdx, -5744(%rbp) + movzbl -5605(%rbp), %edx + movaps %xmm0, -5632(%rbp) + movaps %xmm0, -5648(%rbp) + movq %rdx, %rdi + movzbl -5635(%rbp), %ecx + movzbl -5620(%rbp), %edx + movaps %xmm0, -5664(%rbp) + andl $15, %edi + movzbl -5650(%rbp), %esi + movaps %xmm0, -5680(%rbp) + movzbl -64(%rbp,%rax), %eax + movzbl -64(%rbp,%r13), %r13d + andl $15, %edx + movzbl -64(%rbp,%r11), %r11d + movzbl -64(%rbp,%r10), %r10d + andl $15, %esi + andl $15, %ecx + salq $8, %rax + movzbl -64(%rbp,%r9), %r9d + movzbl -64(%rbp,%rsi), %esi + movq %rdi, -5760(%rbp) + orq %r13, %rax + movzbl -64(%rbp,%rcx), %ecx + movzbl -64(%rbp,%rdx), %edx + salq $8, %rax + movzbl -5665(%rbp), %edi + movzbl -64(%rbp,%r8), %r8d + orq %r11, %rax + salq $8, %rax + andl $15, %edi + orq %r10, %rax + movzbl -64(%rbp,%rdi), %edi + salq $8, %rax + orq %r9, %rax + movzbl -64(%rbp,%r14), %r9d + salq $8, %rax + orq %r9, %rax + movzbl -64(%rbp,%r15), %r9d + movq -5760(%rbp), %r15 + salq $8, %rax + orq %r9, %rax + salq $8, %rax + salq $8, %rdi + orq %rsi, %rdi + movq -5728(%rbp), %rsi + orq %r8, %rax + salq $8, %rdi + orq %rcx, %rdi + movq -5712(%rbp), %rcx + movq %rax, -5712(%rbp) + salq $8, %rdi + orq %rdx, %rdi + movzbl -64(%rbp,%r15), %edx + salq $8, %rdi + orq %rdx, %rdi + movq -5744(%rbp), %rdx + salq $8, %rdi + movzbl -64(%rbp,%rdx), %edx + orq %rdx, %rdi + movzbl -64(%rbp,%rsi), %edx + salq $8, %rdi + orq %rdx, %rdi + movzbl -64(%rbp,%rcx), %edx + salq $8, %rdi + orq %rdx, %rdi + movq %rdi, -5704(%rbp) + movd -5776(%rbp), %xmm6 + pshufd $0, %xmm6, %xmm0 + pcmpgtd -5920(%rbp), %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L589 + movq -5696(%rbp), %rax + movdqa -5712(%rbp), %xmm7 + movd %xmm7, (%rax,%rbx,4) +.L589: + pshufd $85, %xmm0, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L590 + pshufd $85, -5712(%rbp), %xmm1 + movq -5696(%rbp), %rax + movd %xmm1, 4(%rax,%r12) +.L590: + movdqa %xmm0, %xmm1 + punpckhdq %xmm0, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L591 + movdqa -5712(%rbp), %xmm1 + movq -5696(%rbp), %rax + punpckhdq %xmm1, %xmm1 + movd %xmm1, 8(%rax,%r12) +.L591: + pshufd $255, %xmm0, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L592 + pshufd $255, -5712(%rbp), %xmm0 + movq -5696(%rbp), %rax + movd %xmm0, 12(%rax,%r12) +.L592: + movq -5944(%rbp), %r12 + addq -5976(%rbp), %rbx + subq $1, %r12 + cmpl $2, -5948(%rbp) + je .L594 + movq -5904(%rbp), %r8 + movq %r12, %r9 + 
movq %rbx, %rdx + movq -5880(%rbp), %rcx + movq -5960(%rbp), %rsi + movq -5768(%rbp), %rdi + call _ZN3hwy6N_SSE26detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + cmpl $3, -5948(%rbp) + je .L483 +.L594: + movq -5896(%rbp), %rdx + movq -5768(%rbp), %rax + movq %r12, %r9 + movq -5904(%rbp), %r8 + movq -5880(%rbp), %rcx + movq -5960(%rbp), %rsi + subq %rbx, %rdx + leaq (%rax,%rbx,4), %rdi + call _ZN3hwy6N_SSE26detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L483: + addq $6072, %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L575: + .cfi_restore_state + movq -5880(%rbp), %r15 + leaq 8(%rax), %rdi + andq $-8, %rdi + movq (%r15), %rcx + movq %rcx, (%rax) + movl %ebx, %ecx + movq -8(%r15,%rcx), %rsi + movq %rsi, -8(%rax,%rcx) + subq %rdi, %rax + movq %r15, %rsi + leal (%rbx,%rax), %ecx + subq %rax, %rsi + shrl $3, %ecx + rep movsq + jmp .L576 + .p2align 4,,10 + .p2align 3 +.L571: + movq -5696(%rbp), %rdi + movq (%rax), %rcx + movq %rcx, (%rdi) + movl %ebx, %ecx + movq -8(%rax,%rcx), %rsi + movq %rsi, -8(%rdi,%rcx) + movq %rdi, %rcx + leaq 8(%rdi), %rdi + movq %rax, %rsi + andq $-8, %rdi + subq %rdi, %rcx + subq %rcx, %rsi + addl %ebx, %ecx + shrl $3, %ecx + rep movsq + jmp .L572 +.L771: + movq -5880(%rbp), %rax + movl (%rax), %ecx + jmp .L545 +.L776: + movq -5880(%rbp), %rbx + movzbl (%rbx), %esi + movb %sil, (%rax) + testb $2, %cl + je .L576 + movq -5880(%rbp), %rbx + movzwl -2(%rbx,%rcx), %esi + movw %si, -2(%rax,%rcx) + jmp .L576 +.L774: + movzbl (%rax), %esi + movq -5696(%rbp), %rdi + movb %sil, (%rdi) + testb $2, %cl + je .L572 + movzwl -2(%rax,%rcx), %esi + movq -5696(%rbp), %rdi + movw %si, -2(%rdi,%rcx) + jmp .L572 +.L769: + cmpq $1, %rdx + jbe .L483 + movq %rdi, %rax + addq $256, %rax + cmpq %rax, %rsi + jb .L779 + movl $4, %esi + call _ZN3hwy6N_SSE26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + jmp .L483 +.L499: + movq -5768(%rbp), %rax + andl $3, %r13d + movl $4, %edi + movdqa .LC0(%rip), %xmm7 + subq %r13, %rdi + movdqu (%rax), %xmm6 + movaps %xmm7, -5920(%rbp) + movaps %xmm6, -5696(%rbp) + movd %edi, %xmm6 + movdqa -5696(%rbp), %xmm1 + pshufd $0, %xmm6, %xmm3 + pcmpeqd %xmm0, %xmm1 + pcmpgtd %xmm7, %xmm3 + pandn %xmm3, %xmm1 + movmskps %xmm1, %eax + testl %eax, %eax + jne .L780 + movq -5768(%rbp), %rax + pxor %xmm3, %xmm3 + movq -5896(%rbp), %r8 + pxor %xmm5, %xmm5 + movdqa %xmm3, %xmm1 + leaq 256(%rax,%rdi,4), %rsi + .p2align 4,,10 + .p2align 3 +.L505: + movq %rdi, %rcx + leaq 64(%rdi), %rdi + cmpq %rdi, %r8 + jb .L781 + leaq -256(%rsi), %rax +.L504: + movdqa (%rax), %xmm4 + leaq 32(%rax), %rdx + pxor %xmm2, %xmm4 + por %xmm4, %xmm1 + movdqa 16(%rax), %xmm4 + pxor %xmm2, %xmm4 + por %xmm4, %xmm3 + movdqa 32(%rax), %xmm4 + pxor %xmm2, %xmm4 + por %xmm4, %xmm1 + movdqa 48(%rax), %xmm4 + pxor %xmm2, %xmm4 + por %xmm4, %xmm3 + movdqa 64(%rax), %xmm4 + pxor %xmm2, %xmm4 + por %xmm4, %xmm1 + movdqa 80(%rax), %xmm4 + leaq 96(%rdx), %rax + pxor %xmm2, %xmm4 + por %xmm4, %xmm3 + movdqa 64(%rdx), %xmm4 + pxor %xmm2, %xmm4 + por %xmm4, %xmm1 + movdqa 80(%rdx), %xmm4 + pxor %xmm2, %xmm4 + por %xmm4, %xmm3 + cmpq %rsi, %rax + jne .L504 + movdqa %xmm1, %xmm4 + leaq 352(%rdx), %rsi + por %xmm3, %xmm4 + pcmpeqd %xmm5, %xmm4 + movmskps %xmm4, %eax + cmpl $15, %eax + 
je .L505 + movq -5768(%rbp), %rax + movdqa %xmm0, %xmm1 + pcmpeqd %xmm2, %xmm2 + movq -5768(%rbp), %rdx + pcmpeqd (%rax,%rcx,4), %xmm1 + pxor %xmm2, %xmm1 + movmskps %xmm1, %eax + testl %eax, %eax + jne .L507 + .p2align 4,,10 + .p2align 3 +.L506: + addq $4, %rcx + movdqa %xmm0, %xmm1 + pcmpeqd (%rdx,%rcx,4), %xmm1 + pxor %xmm2, %xmm1 + movmskps %xmm1, %eax + testl %eax, %eax + je .L506 +.L507: + rep bsfl %eax, %eax + cltq + addq %rcx, %rax +.L503: + movq -5768(%rbp), %rcx + leaq (%rcx,%rax,4), %rdi + movl (%rdi), %r12d + movd %r12d, %xmm7 + pshufd $0, %xmm7, %xmm2 + movdqa %xmm2, %xmm1 + movaps %xmm2, -5696(%rbp) + pcmpgtd %xmm0, %xmm1 + movmskps %xmm1, %edx + testl %edx, %edx + jne .L512 + movq -5896(%rbp), %r15 + movl %r12d, -5760(%rbp) + xorl %ebx, %ebx + movq %rcx, %r12 + movaps %xmm0, -5744(%rbp) + leaq -4(%r15), %r13 + movaps %xmm0, -5712(%rbp) + movaps %xmm2, -5728(%rbp) + jmp .L521 + .p2align 4,,10 + .p2align 3 +.L513: + movdqa -5744(%rbp), %xmm5 + movmskps %xmm1, %edi + movups %xmm5, (%r12,%r13,4) + call __popcountdi2@PLT + cltq + addq %rax, %rbx + leaq -4(%r13), %rax + cmpq %rax, %r15 + jbe .L782 + movq %rax, %r13 +.L521: + movdqu (%r12,%r13,4), %xmm1 + movdqu (%r12,%r13,4), %xmm4 + pcmpeqd -5712(%rbp), %xmm1 + pcmpeqd -5728(%rbp), %xmm4 + movdqa %xmm1, %xmm5 + movdqa %xmm1, %xmm6 + por %xmm4, %xmm5 + movmskps %xmm5, %eax + cmpl $15, %eax + je .L513 + pcmpeqd %xmm1, %xmm1 + movq -5768(%rbp), %rsi + movq -5896(%rbp), %rdx + leaq 4(%r13), %rcx + pxor %xmm1, %xmm6 + movdqa -5744(%rbp), %xmm3 + movdqa -5712(%rbp), %xmm0 + pandn %xmm6, %xmm4 + subq %rbx, %rdx + movl -5760(%rbp), %r12d + movdqa -5728(%rbp), %xmm2 + movmskps %xmm4, %eax + rep bsfl %eax, %eax + cltq + addq %r13, %rax + movd (%rsi,%rax,4), %xmm7 + leaq 8(%r13), %rax + pshufd $0, %xmm7, %xmm1 + movdqa %xmm1, %xmm4 + movaps %xmm1, -64(%rbp) + cmpq %rax, %rdx + jb .L514 +.L515: + movdqa -5696(%rbp), %xmm6 + movq %rax, %rcx + movups %xmm6, -16(%rsi,%rax,4) + addq $4, %rax + cmpq %rax, %rdx + jnb .L515 +.L514: + subq %rcx, %rdx + leaq 0(,%rcx,4), %rsi + movq %rdx, %xmm1 + pshufd $0, %xmm1, %xmm1 + pcmpgtd -5920(%rbp), %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L516 + movq -5768(%rbp), %rax + movl %r12d, (%rax,%rcx,4) +.L516: + pshufd $85, %xmm1, %xmm5 + movd %xmm5, %eax + testl %eax, %eax + je .L517 + movq -5768(%rbp), %rax + pshufd $85, %xmm2, %xmm5 + movd %xmm5, 4(%rax,%rsi) +.L517: + movdqa %xmm1, %xmm5 + punpckhdq %xmm1, %xmm5 + movd %xmm5, %eax + testl %eax, %eax + je .L518 + movq -5768(%rbp), %rax + movdqa %xmm2, %xmm5 + punpckhdq %xmm2, %xmm5 + movd %xmm5, 8(%rax,%rsi) +.L518: + pshufd $255, %xmm1, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L520 + movq -5768(%rbp), %rax + pshufd $255, %xmm2, %xmm1 + movd %xmm1, 12(%rax,%rsi) +.L520: + movdqa %xmm0, %xmm1 + pcmpeqd .LC5(%rip), %xmm1 + movmskps %xmm1, %eax + cmpl $15, %eax + je .L612 + movdqa %xmm0, %xmm1 + pcmpeqd .LC6(%rip), %xmm1 + movmskps %xmm1, %eax + cmpl $15, %eax + je .L538 + movdqa %xmm4, %xmm1 + pcmpgtd %xmm2, %xmm1 + movdqa %xmm1, %xmm6 + movdqa %xmm1, %xmm5 + pandn %xmm4, %xmm6 + pand %xmm2, %xmm5 + por %xmm6, %xmm5 + movdqa %xmm0, %xmm6 + pcmpgtd %xmm5, %xmm6 + movmskps %xmm6, %eax + testl %eax, %eax + jne .L783 + movdqa %xmm3, %xmm1 + movl $64, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L533: + movq -5768(%rbp), %rbx + leaq (%rcx,%rax,4), %rdx + addq $1, %rax + movdqu (%rbx,%rdx,4), %xmm4 + movdqa %xmm4, %xmm2 + pcmpgtd %xmm1, %xmm2 + movdqa %xmm2, %xmm5 + pand %xmm2, %xmm1 + pandn %xmm4, %xmm5 + por %xmm5, %xmm1 
+ cmpq $16, %rax + jne .L533 + movdqa %xmm0, %xmm2 + pcmpgtd %xmm1, %xmm2 + movmskps %xmm2, %eax + testl %eax, %eax + jne .L768 + leaq 64(%rsi), %rax + cmpq %rax, -5896(%rbp) + jb .L784 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L533 +.L615: + leaq _ZZN3hwy6N_SSE26detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices(%rip), %rdi + movq %rsi, -5696(%rbp) + xorl %ebx, %ebx + movdqa .LC0(%rip), %xmm5 + movq $0, -5712(%rbp) + movq %rdi, -5888(%rbp) + movaps %xmm5, -5920(%rbp) + jmp .L552 +.L616: + movq -5968(%rbp), %rdx + movq -5696(%rbp), %rcx + xorl %ebx, %ebx + jmp .L579 +.L617: + leaq _ZZN3hwy6N_SSE26detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices(%rip), %rax + movq -5696(%rbp), %r14 + movq %rdi, %r13 + movl $4, %ebx + movq %rax, -5744(%rbp) + jmp .L580 +.L772: + movq -5896(%rbp), %rsi + movq -5768(%rbp), %rdi + leaq -1(%rsi), %rbx + movq %rbx, %r12 + shrq %r12 + .p2align 4,,10 + .p2align 3 +.L550: + movq %r12, %rdx + call _ZN3hwy6N_SSE26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0 + subq $1, %r12 + jnb .L550 + .p2align 4,,10 + .p2align 3 +.L551: + movl (%rdi,%rbx,4), %edx + movl (%rdi), %eax + movq %rbx, %rsi + movl %edx, (%rdi) + xorl %edx, %edx + movl %eax, (%rdi,%rbx,4) + call _ZN3hwy6N_SSE26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0 + subq $1, %rbx + jne .L551 + jmp .L483 +.L775: + movq -5880(%rbp), %rbx + movl (%rbx), %esi + movl %esi, (%rax) + movl -4(%rbx,%rcx), %esi + movl %esi, -4(%rax,%rcx) + jmp .L576 +.L773: + movl (%rax), %esi + movq -5696(%rbp), %rdi + movl %esi, (%rdi) + movl -4(%rax,%rcx), %esi + movl %esi, -4(%rdi,%rcx) + jmp .L572 +.L781: + movq -5768(%rbp), %rsi + movq -5896(%rbp), %rdi + pcmpeqd %xmm2, %xmm2 +.L509: + movq %rcx, %rdx + addq $4, %rcx + cmpq %rcx, %rdi + jb .L785 + movdqa %xmm0, %xmm1 + pcmpeqd -16(%rsi,%rcx,4), %xmm1 + pxor %xmm2, %xmm1 + movmskps %xmm1, %eax + testl %eax, %eax + je .L509 +.L766: + rep bsfl %eax, %eax + cltq + addq %rdx, %rax + jmp .L503 +.L512: + movq -5896(%rbp), %rsi + movq -5880(%rbp), %rbx + leaq -64(%rbp), %rdx + movdqa %xmm2, %xmm1 + movaps %xmm2, -5696(%rbp) + subq %rax, %rsi + movq %rbx, %rcx + call _ZN3hwy6N_SSE26detail22MaybePartitionTwoValueINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, %al + jne .L483 + movd (%rbx), %xmm7 + movdqa -64(%rbp), %xmm4 + movdqa -5696(%rbp), %xmm2 + pshufd $0, %xmm7, %xmm0 + movdqa %xmm0, %xmm3 + jmp .L520 +.L782: + movq -5768(%rbp), %rax + movdqa -5712(%rbp), %xmm0 + movdqa -5728(%rbp), %xmm2 + movq -5896(%rbp), %r15 + movdqu (%rax), %xmm7 + movdqa -5744(%rbp), %xmm3 + movl -5760(%rbp), %r12d + subq %rbx, %r15 + movaps %xmm7, -5712(%rbp) + movd %r13d, %xmm7 + movdqa -5712(%rbp), %xmm1 + movdqa -5712(%rbp), %xmm5 + pshufd $0, %xmm7, %xmm4 + pcmpgtd -5920(%rbp), %xmm4 + pcmpeqd %xmm0, %xmm1 + pcmpeqd %xmm2, %xmm5 + movdqa %xmm4, %xmm6 + pand %xmm1, %xmm6 + por %xmm5, %xmm1 + pcmpeqd %xmm5, %xmm5 + pxor %xmm5, %xmm4 + por %xmm4, %xmm1 + movmskps %xmm1, %eax + cmpl $15, %eax + jne .L786 + movmskps %xmm6, %edi + movaps %xmm2, -5728(%rbp) + movaps %xmm3, -5712(%rbp) + call __popcountdi2@PLT + movq -5768(%rbp), %rbx + movdqa -5712(%rbp), %xmm3 + movslq %eax, %rdx + movq %r15, %rax + movdqa -5728(%rbp), %xmm2 + subq %rdx, %rax + movups %xmm3, (%rbx) + cmpq $3, %rax + jbe .L601 + leaq -4(%rax), 
%rdx + movq %rdx, %rcx + shrq $2, %rcx + salq $4, %rcx + leaq 16(%rbx,%rcx), %rcx +.L530: + movdqa -5696(%rbp), %xmm6 + addq $16, %r14 + movups %xmm6, -16(%r14) + cmpq %rcx, %r14 + jne .L530 + andq $-4, %rdx + addq $4, %rdx + leaq 0(,%rdx,4), %rcx + subq %rdx, %rax +.L529: + movq -5880(%rbp), %rsi + movaps %xmm2, (%rsi) + testq %rax, %rax + je .L483 + movq -5768(%rbp), %rdi + leaq 0(,%rax,4), %rdx + addq %rcx, %rdi + call memcpy@PLT + jmp .L483 +.L784: + movq -5896(%rbp), %rcx + movq %rbx, %rdx + jmp .L540 +.L541: + movdqu -16(%rdx,%rsi,4), %xmm5 + movdqa %xmm0, %xmm1 + pcmpgtd %xmm5, %xmm1 + movmskps %xmm1, %eax + testl %eax, %eax + jne .L768 +.L540: + movq %rsi, %rax + addq $4, %rsi + cmpq %rsi, %rcx + jnb .L541 + movq -5896(%rbp), %rbx + cmpq %rax, %rbx + je .L612 + movq -5768(%rbp), %rax + movdqu -16(%rax,%rbx,4), %xmm7 + movaps %xmm7, -5696(%rbp) + pcmpgtd -5696(%rbp), %xmm0 + movmskps %xmm0, %eax + cmpl $1, %eax + movl $1, %eax + adcl $0, %eax + movl %eax, -5948(%rbp) + jmp .L542 +.L785: + movq -5896(%rbp), %rax + pcmpeqd %xmm2, %xmm2 + leaq -4(%rax), %rdx + movq -5768(%rbp), %rax + movdqu (%rax,%rdx,4), %xmm7 + movaps %xmm7, -5696(%rbp) + movdqa -5696(%rbp), %xmm1 + pcmpeqd %xmm0, %xmm1 + pxor %xmm2, %xmm1 + movmskps %xmm1, %eax + testl %eax, %eax + je .L483 + jmp .L766 +.L788: + movq %rbx, %rdx + jmp .L536 +.L537: + movdqu -16(%rdx,%rsi,4), %xmm1 + pcmpgtd %xmm0, %xmm1 + movmskps %xmm1, %eax + testl %eax, %eax + jne .L768 +.L536: + movq %rsi, %rax + addq $4, %rsi + cmpq %rsi, -5896(%rbp) + jnb .L537 + movq -5896(%rbp), %rbx + cmpq %rax, %rbx + je .L538 + movq -5768(%rbp), %rax + movdqu -16(%rax,%rbx,4), %xmm6 + movaps %xmm6, -5696(%rbp) + movdqa -5696(%rbp), %xmm1 + pcmpgtd %xmm0, %xmm1 + movmskps %xmm1, %eax + testl %eax, %eax + jne .L768 +.L538: + movl $3, -5948(%rbp) + pcmpeqd %xmm3, %xmm3 + paddd %xmm0, %xmm3 + jmp .L542 +.L612: + movl $2, -5948(%rbp) + jmp .L542 +.L780: + rep bsfl %eax, %eax + cltq + jmp .L503 +.L779: + cmpq $3, %rdx + jbe .L787 + movq %rdx, %r14 + leaq -4(%rdx), %rdx + movq %rcx, %rbx + movq (%rdi), %rcx + movq %rdx, %rax + movq %rdi, %r15 + shrq $2, %rax + movq %rcx, (%rbx) + addq $1, %rax + salq $4, %rax + movl %eax, %ecx + movq -8(%rdi,%rcx), %rsi + leaq 8(%rbx), %rdi + andq $-8, %rdi + movq %rsi, -8(%rbx,%rcx) + subq %rdi, %rbx + movq %r15, %rsi + movq %rbx, %rcx + subq %rbx, %rsi + addl %eax, %ecx + movq %rdx, %rax + andq $-4, %rax + shrl $3, %ecx + rep movsq + addq $4, %rax + movq %r14, %rcx + subq %rax, %rcx + je .L492 +.L489: + movq -5880(%rbp), %rbx + salq $2, %rax + leaq 0(,%rcx,4), %rdx + testq %rcx, %rcx + movl $4, %ecx + leaq (%rbx,%rax), %rdi + movq -5768(%rbp), %rbx + cmove %rcx, %rdx + leaq (%rbx,%rax), %rsi + call memcpy@PLT +.L492: + movq -5896(%rbp), %rbx + movl $32, %ecx + movl %ebx, %eax + subl $1, %eax + bsrl %eax, %eax + xorl $31, %eax + subl %eax, %ecx + movl $1, %eax + salq %cl, %rax + shrq $4, %rax + movq %rax, %rsi + movl $1, %eax + cmove %rax, %rsi + movq %rsi, %rdx + salq $4, %rdx + addq $4, %rdx + cmpq %rdx, %rbx + jnb .L491 + movdqa .LC4(%rip), %xmm0 + movq -5880(%rbp), %rcx + movq %rbx, %rax +.L490: + movups %xmm0, (%rcx,%rax,4) + addq $4, %rax + cmpq %rdx, %rax + jb .L490 +.L491: + movq -5880(%rbp), %rdi + call _ZN3hwy6N_SSE26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + xorl %eax, %eax + cmpq $3, -5896(%rbp) + jbe .L494 + movq -5896(%rbp), %r14 + movq -5880(%rbp), %r15 + movq -5768(%rbp), %rbx + leaq -4(%r14), %rdx + movq (%r15), %rcx + movq %rdx, %rax + leaq 
8(%rbx), %rdi + shrq $2, %rax + movq %rcx, (%rbx) + andq $-8, %rdi + addq $1, %rax + salq $4, %rax + movl %eax, %ecx + movq -8(%r15,%rcx), %rsi + movq %rsi, -8(%rbx,%rcx) + subq %rdi, %rbx + movq %r15, %rsi + movq %rbx, %rcx + subq %rbx, %rsi + addl %eax, %ecx + movq %rdx, %rax + andq $-4, %rax + shrl $3, %ecx + rep movsq + addq $4, %rax + subq %rax, %r14 + movq %r14, -5896(%rbp) + je .L483 +.L494: + movq -5896(%rbp), %rbx + salq $2, %rax + movq -5768(%rbp), %rdi + movl $4, %ecx + movq -5880(%rbp), %rsi + addq %rax, %rdi + leaq 0(,%rbx,4), %rdx + testq %rbx, %rbx + cmove %rcx, %rdx + addq %rax, %rsi + call memcpy@PLT + jmp .L483 +.L783: + movdqa %xmm1, %xmm5 + pand %xmm4, %xmm1 + pandn %xmm2, %xmm5 + por %xmm5, %xmm1 + pcmpgtd %xmm0, %xmm1 + movmskps %xmm1, %eax + testl %eax, %eax + jne .L768 + movdqa %xmm3, %xmm1 + movl $64, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L534: + movq -5768(%rbp), %rbx + leaq (%rcx,%rax,4), %rdx + addq $1, %rax + movdqu (%rbx,%rdx,4), %xmm2 + movdqa %xmm2, %xmm4 + pcmpgtd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pand %xmm4, %xmm2 + pandn %xmm1, %xmm5 + movdqa %xmm2, %xmm1 + por %xmm5, %xmm1 + cmpq $16, %rax + jne .L534 + movdqa %xmm1, %xmm2 + pcmpgtd %xmm0, %xmm2 + movmskps %xmm2, %eax + testl %eax, %eax + jne .L768 + leaq 64(%rsi), %rax + cmpq %rax, -5896(%rbp) + jb .L788 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L534 +.L601: + xorl %ecx, %ecx + jmp .L529 +.L786: + pxor %xmm5, %xmm1 + movq -5768(%rbp), %rbx + movmskps %xmm1, %eax + rep bsfl %eax, %eax + cltq + movd (%rbx,%rax,4), %xmm6 + leaq 4(%r13), %rax + pshufd $0, %xmm6, %xmm1 + movdqa %xmm1, %xmm4 + movaps %xmm1, -64(%rbp) + cmpq %r15, %rax + ja .L523 +.L524: + movq -5768(%rbp), %rbx + movdqa -5696(%rbp), %xmm7 + movq %rax, %r13 + movups %xmm7, -16(%rbx,%rax,4) + addq $4, %rax + cmpq %r15, %rax + jbe .L524 +.L523: + subq %r13, %r15 + leaq 0(,%r13,4), %rdx + movq %r15, %xmm1 + pshufd $0, %xmm1, %xmm1 + pcmpgtd -5920(%rbp), %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L525 + movq -5768(%rbp), %rax + movl %r12d, (%rax,%r13,4) +.L525: + pshufd $85, %xmm1, %xmm5 + movd %xmm5, %eax + testl %eax, %eax + je .L526 + movq -5768(%rbp), %rax + pshufd $85, %xmm2, %xmm5 + movd %xmm5, 4(%rax,%rdx) +.L526: + movdqa %xmm1, %xmm5 + punpckhdq %xmm1, %xmm5 + movd %xmm5, %eax + testl %eax, %eax + je .L527 + movq -5768(%rbp), %rax + movdqa %xmm2, %xmm5 + punpckhdq %xmm2, %xmm5 + movd %xmm5, 8(%rax,%rdx) +.L527: + pshufd $255, %xmm1, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L520 + movq -5768(%rbp), %rax + pshufd $255, %xmm2, %xmm1 + movd %xmm1, 12(%rax,%rdx) + jmp .L520 +.L787: + movq %rdx, %rcx + xorl %eax, %eax + jmp .L489 + .cfi_endproc +.LFE18798: + .size _ZN3hwy6N_SSE26detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0, .-_ZN3hwy6N_SSE26detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + .section .text._ZN3hwy11N_AVX3_ZEN46detail7RecurseINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy11N_AVX3_ZEN46detail7RecurseINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0, @function +_ZN3hwy11N_AVX3_ZEN46detail7RecurseINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0: 
+.LFB18800: + .cfi_startproc + leaq 8(%rsp), %r10 + .cfi_def_cfa 10, 0 + andq $-64, %rsp + pushq -8(%r10) + pushq %rbp + movq %rsp, %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + pushq %r15 + .cfi_escape 0x10,0xf,0x2,0x76,0x78 + movq %rdx, %r15 + pushq %r14 + pushq %r13 + .cfi_escape 0x10,0xe,0x2,0x76,0x70 + .cfi_escape 0x10,0xd,0x2,0x76,0x68 + movq %rdi, %r13 + pushq %r12 + .cfi_escape 0x10,0xc,0x2,0x76,0x60 + movq %rcx, %r12 + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x58,0x6 + pushq %rbx + addq $-128, %rsp + .cfi_escape 0x10,0x3,0x2,0x76,0x50 + movq %rsi, -128(%rbp) + movq %r9, -120(%rbp) + cmpq $256, %rdx + jbe .L956 + movq %rdi, %r11 + movq %rdi, -152(%rbp) + movq %r8, %rbx + shrq $2, %r11 + movq %r11, %rdi + andl $15, %edi + movq %rdi, -144(%rbp) + jne .L957 + movq %rdx, -136(%rbp) + movq %r13, %r11 +.L801: + movq 8(%rbx), %rdx + movq 16(%rbx), %r9 + movq %rdx, %rsi + leaq 1(%r9), %rdi + leaq (%rdx,%rdx,8), %rcx + xorq (%rbx), %rdi + shrq $11, %rsi + rorx $40, %rdx, %rax + leaq 2(%r9), %rdx + addq %rdi, %rax + xorq %rsi, %rcx + movq %rax, %r8 + rorx $40, %rax, %rsi + xorq %rdx, %rcx + shrq $11, %r8 + leaq (%rax,%rax,8), %rdx + leaq 3(%r9), %rax + addq %rcx, %rsi + xorq %r8, %rdx + movq %rsi, %r8 + xorq %rax, %rdx + leaq (%rsi,%rsi,8), %rax + rorx $40, %rsi, %r10 + shrq $11, %r8 + addq %rdx, %r10 + leaq 4(%r9), %rsi + addq $5, %r9 + xorq %r8, %rax + rorx $40, %r10, %r8 + movq %r9, 16(%rbx) + xorq %rsi, %rax + movq %r10, %rsi + shrq $11, %rsi + addq %rax, %r8 + movq %rsi, %r14 + leaq (%r10,%r10,8), %rsi + leaq (%r8,%r8,8), %r10 + xorq %r14, %rsi + movq %r8, %r14 + rorx $40, %r8, %r8 + shrq $11, %r14 + xorq %r9, %rsi + movabsq $68719476719, %r9 + xorq %r14, %r10 + addq %rsi, %r8 + movl %esi, %esi + vmovq %r10, %xmm7 + movq -136(%rbp), %r10 + vpinsrq $1, %r8, %xmm7, %xmm1 + movq %r10, %r14 + vmovdqu %xmm1, (%rbx) + shrq $4, %r14 + cmpq %r9, %r10 + movl $4294967295, %r9d + movq %r14, %r8 + leaq 192(%r12), %r14 + cmova %r9, %r8 + movl %ecx, %r9d + shrq $32, %rcx + imulq %r8, %r9 + imulq %r8, %rcx + imulq %r8, %rsi + shrq $32, %r9 + salq $6, %r9 + shrq $32, %rcx + vmovdqa32 (%r11,%r9), %zmm3 + movl %edi, %r9d + shrq $32, %rdi + imulq %r8, %r9 + salq $6, %rcx + shrq $32, %rsi + imulq %r8, %rdi + salq $6, %rsi + vmovdqa32 (%r11,%rsi), %zmm5 + shrq $32, %r9 + salq $6, %r9 + shrq $32, %rdi + vmovdqa32 (%r11,%r9), %zmm2 + salq $6, %rdi + vpminsd %zmm3, %zmm2, %zmm1 + vpmaxsd (%r11,%rdi), %zmm1, %zmm1 + movq %rdx, %rdi + movl %edx, %edx + shrq $32, %rdi + imulq %r8, %rdx + vpmaxsd %zmm3, %zmm2, %zmm2 + imulq %r8, %rdi + vpminsd %zmm2, %zmm1, %zmm1 + vmovdqa32 (%r11,%rcx), %zmm2 + vpbroadcastd %xmm1, %zmm0 + vmovdqa32 %zmm1, (%r12) + shrq $32, %rdx + vpxord %zmm0, %zmm1, %zmm1 + shrq $32, %rdi + salq $6, %rdx + salq $6, %rdi + vmovdqa32 (%r11,%rdi), %zmm4 + vpminsd %zmm4, %zmm2, %zmm3 + vpmaxsd (%r11,%rdx), %zmm3, %zmm3 + movl %eax, %edx + shrq $32, %rax + imulq %r8, %rdx + vpmaxsd %zmm4, %zmm2, %zmm2 + imulq %r8, %rax + vpminsd %zmm2, %zmm3, %zmm3 + vmovdqa32 %zmm3, 64(%r12) + vpxord %zmm0, %zmm3, %zmm3 + shrq $32, %rdx + vpord %zmm3, %zmm1, %zmm1 + salq $6, %rdx + shrq $32, %rax + vmovdqa32 (%r11,%rdx), %zmm4 + salq $6, %rax + vpminsd %zmm5, %zmm4, %zmm2 + vpmaxsd (%r11,%rax), %zmm2, %zmm2 + vpmaxsd %zmm5, %zmm4, %zmm4 + vpminsd %zmm4, %zmm2, %zmm2 + vmovdqa32 %zmm2, 128(%r12) + vpxord %zmm0, %zmm2, %zmm2 + vpord %zmm2, %zmm1, %zmm1 + vptestnmd %zmm1, %zmm1, %k0 + kortestw %k0, %k0 + jc .L803 + vpbroadcastq .LC10(%rip), %zmm0 + movl $4, %esi + movq %r12, %rdi + vmovdqu64 %zmm0, 192(%r12) + vmovdqu64 
%zmm0, 256(%r12) + vzeroupper + call _ZN3hwy11N_AVX3_ZEN46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + vpbroadcastd (%r12), %zmm0 + vpbroadcastd 188(%r12), %zmm1 + vpternlogd $0xFF, %zmm2, %zmm2, %zmm2 + vpaddd %zmm2, %zmm1, %zmm2 + vpcmpd $0, %zmm2, %zmm0, %k0 + kortestw %k0, %k0 + jnc .L805 + leaq -112(%rbp), %rdx + movq %r14, %rcx + movq %r15, %rsi + movq %r13, %rdi + call _ZN3hwy11N_AVX3_ZEN46detail22MaybePartitionTwoValueINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, %al + jne .L951 +.L805: + movl 96(%r12), %ecx + movl $23, %eax + movl $24, %edx + cmpl %ecx, 92(%r12) + je .L845 + jmp .L850 + .p2align 4,,10 + .p2align 3 +.L848: + testq %rax, %rax + je .L958 +.L845: + movq %rax, %rdx + subq $1, %rax + movl (%r12,%rax,4), %esi + cmpl %esi, %ecx + je .L848 + cmpl (%r12,%rdx,4), %ecx + je .L850 + movl %esi, %ecx + jmp .L847 + .p2align 4,,10 + .p2align 3 +.L851: + cmpq $47, %rdx + je .L954 +.L850: + movq %rdx, %rsi + addq $1, %rdx + cmpl (%r12,%rdx,4), %ecx + je .L851 + movl $24, %edx + subq $23, %rsi + subq %rax, %rdx + cmpq %rdx, %rsi + jb .L847 +.L954: + movl (%r12,%rax,4), %ecx +.L847: + vpbroadcastd %ecx, %zmm1 +.L955: + movl $1, -136(%rbp) +.L843: + cmpq $0, -120(%rbp) + je .L959 + leaq -16(%r15), %rdx + vmovdqa32 %zmm1, %zmm0 + leaq 0(%r13,%rdx,4), %r10 + movq %rdx, %r9 + movq %rdx, %rcx + vmovdqu64 (%r10), %zmm6 + andl $63, %r9d + andl $48, %ecx + je .L854 + vmovdqu64 0(%r13), %zmm2 + movq $-1, %rdi + vpcmpd $6, %zmm1, %zmm2, %k1 + knotw %k1, %k2 + vpcompressd %zmm2, %zmm1{%k2}{z} + kmovw %k2, %eax + popcntq %rax, %rax + bzhi %rax, %rdi, %rcx + kmovw %ecx, %k7 + vmovdqu32 %zmm1, 0(%r13){%k7} + vpcompressd %zmm2, %zmm1{%k1}{z} + kmovw %k1, %ecx + leaq 0(%r13,%rax,4), %rax + vmovdqu64 %zmm1, (%r12) + popcntq %rcx, %rcx + testb $32, %dl + je .L855 + vmovdqu64 64(%r13), %zmm1 + vpcmpd $6, %zmm0, %zmm1, %k1 + knotw %k1, %k2 + vpcompressd %zmm1, %zmm2{%k2}{z} + kmovw %k2, %esi + popcntq %rsi, %rsi + bzhi %rsi, %rdi, %r8 + kmovw %r8d, %k4 + vmovdqu32 %zmm2, (%rax){%k4} + vpcompressd %zmm1, %zmm2{%k1}{z} + leaq (%rax,%rsi,4), %rax + vmovdqu64 %zmm2, (%r12,%rcx,4) + kmovw %k1, %esi + popcntq %rsi, %rsi + addq %rsi, %rcx + cmpq $47, %r9 + jbe .L855 + vmovdqu64 128(%r13), %zmm1 + vpcmpd $6, %zmm0, %zmm1, %k1 + knotw %k1, %k2 + vpcompressd %zmm1, %zmm2{%k2}{z} + kmovw %k2, %esi + popcntq %rsi, %rsi + bzhi %rsi, %rdi, %rdi + kmovw %edi, %k7 + vmovdqu32 %zmm2, (%rax){%k7} + vpcompressd %zmm1, %zmm2{%k1}{z} + leaq (%rax,%rsi,4), %rax + vmovdqu64 %zmm2, (%r12,%rcx,4) + kmovw %k1, %esi + popcntq %rsi, %rsi + addq %rsi, %rcx + leaq 1(%r9), %rsi + cmpq $17, %rsi + leaq 0(,%rcx,4), %r8 + sbbq %rsi, %rsi + andq $-32, %rsi + addq $48, %rsi + cmpq %rsi, %r9 + jne .L960 +.L857: + movq %rax, %r9 + movq %rdx, %rsi + subq %r13, %r9 + subq %rcx, %rsi + sarq $2, %r9 + leaq 0(%r13,%rsi,4), %r11 + subq %r9, %rdx + subq %r9, %rsi + movq %rdx, %r14 + leaq (%rax,%rsi,4), %r10 + movq %rsi, %rdx + leaq (%rax,%r14,4), %rdi + vmovq %rdi, %xmm8 + .p2align 4,,10 + .p2align 3 +.L859: + movl %r8d, %ecx + cmpl $8, %r8d + jnb .L861 + testb $4, %r8b + jne .L961 + testl %ecx, %ecx + jne .L962 +.L862: + movl %r8d, %ecx + cmpl $8, %r8d + jnb .L865 + andl $4, %r8d + jne .L963 + testl %ecx, %ecx + jne .L964 +.L866: + testq %rdx, %rdx + je .L905 +.L880: + leaq 256(%rax), %rsi + leaq -256(%r10), %rdi + vmovdqu64 (%rax), %zmm15 + vmovdqu64 64(%rax), %zmm14 + vmovdqu64 
128(%rax), %zmm13 + vmovdqu64 192(%rax), %zmm12 + vmovdqu64 -128(%r10), %zmm9 + vmovdqu64 -64(%r10), %zmm7 + vmovdqu64 -256(%r10), %zmm11 + vmovdqu64 -192(%r10), %zmm10 + cmpq %rdi, %rsi + je .L906 + xorl %ecx, %ecx + movq $-1, %r8 + jmp .L873 + .p2align 4,,10 + .p2align 3 +.L966: + vmovdqu64 -128(%rdi), %zmm2 + vmovdqu64 -64(%rdi), %zmm1 + prefetcht0 -1024(%rdi) + subq $256, %rdi + vmovdqu64 (%rdi), %zmm4 + vmovdqu64 64(%rdi), %zmm3 +.L872: + vpcmpd $6, %zmm0, %zmm4, %k1 + knotw %k1, %k2 + vpcompressd %zmm4, %zmm5{%k2}{z} + kmovw %k2, %r10d + popcntq %r10, %r10 + vmovdqu64 %zmm5, (%rax,%rcx,4) + addq %rcx, %r10 + vpcompressd %zmm4, %zmm5{%k1}{z} + kmovw %k1, %ecx + vpcmpd $6, %zmm0, %zmm3, %k1 + leaq -16(%rdx,%r10), %r11 + popcntq %rcx, %rcx + bzhi %rcx, %r8, %rcx + kmovw %ecx, %k3 + vmovdqu32 %zmm5, (%rax,%r11,4){%k3} + knotw %k1, %k2 + vpcompressd %zmm3, %zmm4{%k2}{z} + kmovw %k2, %ecx + popcntq %rcx, %rcx + vmovdqu64 %zmm4, (%rax,%r10,4) + addq %r10, %rcx + vpcompressd %zmm3, %zmm4{%k1}{z} + kmovw %k1, %r10d + vpcmpd $6, %zmm0, %zmm2, %k1 + leaq -32(%rdx,%rcx), %r11 + popcntq %r10, %r10 + bzhi %r10, %r8, %r10 + kmovw %r10d, %k5 + vmovdqu32 %zmm4, (%rax,%r11,4){%k5} + knotw %k1, %k2 + vpcompressd %zmm2, %zmm3{%k2}{z} + kmovw %k2, %r10d + popcntq %r10, %r10 + vmovdqu64 %zmm3, (%rax,%rcx,4) + addq %rcx, %r10 + vpcompressd %zmm2, %zmm3{%k1}{z} + kmovw %k1, %ecx + vpcmpd $6, %zmm0, %zmm1, %k1 + leaq -48(%rdx,%r10), %r11 + popcntq %rcx, %rcx + subq $64, %rdx + bzhi %rcx, %r8, %rcx + kmovw %ecx, %k6 + vmovdqu32 %zmm3, (%rax,%r11,4){%k6} + knotw %k1, %k2 + vpcompressd %zmm1, %zmm2{%k2}{z} + kmovw %k2, %ecx + popcntq %rcx, %rcx + addq %r10, %rcx + vmovdqu64 %zmm2, (%rax,%r10,4) + vpcompressd %zmm1, %zmm2{%k1}{z} + kmovw %k1, %r10d + leaq (%rdx,%rcx), %r11 + popcntq %r10, %r10 + bzhi %r10, %r8, %r10 + kmovw %r10d, %k7 + vmovdqu32 %zmm2, (%rax,%r11,4){%k7} + cmpq %rdi, %rsi + je .L965 +.L873: + movq %rsi, %r10 + subq %rax, %r10 + sarq $2, %r10 + subq %rcx, %r10 + cmpq $64, %r10 + ja .L966 + vmovdqu64 (%rsi), %zmm4 + vmovdqu64 64(%rsi), %zmm3 + prefetcht0 1024(%rsi) + addq $256, %rsi + vmovdqu64 -128(%rsi), %zmm2 + vmovdqu64 -64(%rsi), %zmm1 + jmp .L872 + .p2align 4,,10 + .p2align 3 +.L957: + movl $16, %eax + subq %rdi, %rax + leaq 0(%r13,%rax,4), %r11 + leaq -16(%rdi,%rdx), %rax + movq %rax, -136(%rbp) + jmp .L801 + .p2align 4,,10 + .p2align 3 +.L965: + leaq (%rax,%rcx,4), %rsi +.L870: + vpcmpd $6, %zmm0, %zmm15, %k1 + knotw %k1, %k2 + vpcompressd %zmm15, %zmm1{%k2}{z} + kmovw %k2, %edi + popcntq %rdi, %rdi + vmovdqu64 %zmm1, (%rsi) + vpcompressd %zmm15, %zmm1{%k1}{z} + kmovw %k1, %esi + addq %rcx, %rdi + vpcmpd $6, %zmm0, %zmm14, %k1 + leaq -16(%rdx,%rdi), %r8 + popcntq %rsi, %rsi + movq $-1, %rcx + bzhi %rsi, %rcx, %rsi + kmovw %esi, %k4 + vmovdqu32 %zmm1, (%rax,%r8,4){%k4} + knotw %k1, %k2 + vpcompressd %zmm14, %zmm1{%k2}{z} + kmovw %k2, %esi + popcntq %rsi, %rsi + vmovdqu64 %zmm1, (%rax,%rdi,4) + addq %rdi, %rsi + vpcompressd %zmm14, %zmm1{%k1}{z} + kmovw %k1, %edi + vpcmpd $6, %zmm0, %zmm13, %k1 + leaq -32(%rdx,%rsi), %r8 + popcntq %rdi, %rdi + bzhi %rdi, %rcx, %rdi + kmovw %edi, %k4 + vmovdqu32 %zmm1, (%rax,%r8,4){%k4} + knotw %k1, %k2 + vpcompressd %zmm13, %zmm1{%k2}{z} + kmovw %k2, %edi + popcntq %rdi, %rdi + vmovdqu64 %zmm1, (%rax,%rsi,4) + addq %rsi, %rdi + vpcompressd %zmm13, %zmm1{%k1}{z} + kmovw %k1, %esi + vpcmpd $6, %zmm0, %zmm12, %k1 + leaq -48(%rdx,%rdi), %r8 + popcntq %rsi, %rsi + bzhi %rsi, %rcx, %rsi + kmovw %esi, %k4 + vmovdqu32 %zmm1, (%rax,%r8,4){%k4} + knotw %k1, %k2 + 
vpcompressd %zmm12, %zmm1{%k2}{z} + kmovw %k2, %esi + popcntq %rsi, %rsi + vmovdqu64 %zmm1, (%rax,%rdi,4) + addq %rdi, %rsi + vpcompressd %zmm12, %zmm1{%k1}{z} + kmovw %k1, %edi + vpcmpd $6, %zmm0, %zmm11, %k1 + leaq -64(%rdx,%rsi), %r8 + popcntq %rdi, %rdi + bzhi %rdi, %rcx, %rdi + kmovw %edi, %k4 + vmovdqu32 %zmm1, (%rax,%r8,4){%k4} + knotw %k1, %k2 + vpcompressd %zmm11, %zmm1{%k2}{z} + kmovw %k2, %edi + popcntq %rdi, %rdi + vmovdqu64 %zmm1, (%rax,%rsi,4) + addq %rsi, %rdi + vpcompressd %zmm11, %zmm1{%k1}{z} + kmovw %k1, %esi + vpcmpd $6, %zmm0, %zmm10, %k1 + leaq -80(%rdx,%rdi), %r8 + popcntq %rsi, %rsi + bzhi %rsi, %rcx, %rsi + kmovw %esi, %k4 + vmovdqu32 %zmm1, (%rax,%r8,4){%k4} + knotw %k1, %k2 + vpcompressd %zmm10, %zmm1{%k2}{z} + kmovw %k2, %esi + popcntq %rsi, %rsi + vmovdqu64 %zmm1, (%rax,%rdi,4) + addq %rdi, %rsi + vpcompressd %zmm10, %zmm1{%k1}{z} + kmovw %k1, %edi + vpcmpd $6, %zmm0, %zmm9, %k1 + leaq -96(%rdx,%rsi), %r8 + popcntq %rdi, %rdi + bzhi %rdi, %rcx, %rdi + kmovw %edi, %k3 + vmovdqu32 %zmm1, (%rax,%r8,4){%k3} + knotw %k1, %k2 + vpcompressd %zmm9, %zmm1{%k2}{z} + kmovw %k2, %edi + popcntq %rdi, %rdi + vmovdqu64 %zmm1, (%rax,%rsi,4) + addq %rsi, %rdi + vpcompressd %zmm9, %zmm1{%k1}{z} + kmovw %k1, %esi + vpcmpd $6, %zmm0, %zmm7, %k1 + leaq -112(%rdx,%rdi), %r8 + popcntq %rsi, %rsi + bzhi %rsi, %rcx, %rsi + kmovw %esi, %k5 + leaq -128(%rdx), %rsi + vmovdqu32 %zmm1, (%rax,%r8,4){%k5} + knotw %k1, %k2 + vpcompressd %zmm7, %zmm1{%k2}{z} + kmovw %k2, %edx + popcntq %rdx, %rdx + addq %rdi, %rdx + vmovdqu64 %zmm1, (%rax,%rdi,4) + vpcompressd %zmm7, %zmm1{%k1}{z} + kmovw %k1, %edi + popcntq %rdi, %rdi + bzhi %rdi, %rcx, %rcx + addq %rdx, %rsi + kmovw %ecx, %k6 + movq %r14, %rcx + vmovdqu32 %zmm1, (%rax,%rsi,4){%k6} + leaq (%rax,%rdx,4), %rdi + subq %rdx, %rcx +.L869: + movq %rdi, %rsi + cmpq $15, %rcx + ja .L874 + leaq -64(%rax,%r14,4), %rsi +.L874: + vpcmpd $6, %zmm0, %zmm6, %k1 + vmovdqu64 (%rsi), %zmm7 + vmovq %xmm8, %rcx + movq $-1, %rsi + vmovdqu64 %zmm7, (%rcx) + knotw %k1, %k2 + vpcompressd %zmm6, %zmm0{%k2}{z} + kmovw %k2, %ecx + popcntq %rcx, %rcx + addq %rcx, %rdx + bzhi %rcx, %rsi, %r8 + kmovw %r8d, %k4 + kmovw %k1, %ecx + vmovdqu32 %zmm0, (%rdi){%k4} + vpcompressd %zmm6, %zmm0{%k1}{z} + popcntq %rcx, %rcx + leaq (%rdx,%r9), %r14 + bzhi %rcx, %rsi, %rsi + kmovw %esi, %k4 + vmovdqu32 %zmm0, (%rax,%rdx,4){%k4} + movq -120(%rbp), %r9 + subq $1, %r9 + cmpl $2, -136(%rbp) + je .L967 + movq -128(%rbp), %rsi + movq %rbx, %r8 + movq %r12, %rcx + movq %r14, %rdx + movq %r13, %rdi + movq %r9, -120(%rbp) + vzeroupper + call _ZN3hwy11N_AVX3_ZEN46detail7RecurseINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + cmpl $3, -136(%rbp) + movq -120(%rbp), %r9 + je .L951 +.L876: + movq %r15, %rdx + movq -128(%rbp), %rsi + leaq 0(%r13,%r14,4), %rdi + movq %rbx, %r8 + subq %r14, %rdx + movq %r12, %rcx + call _ZN3hwy11N_AVX3_ZEN46detail7RecurseINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L951: + subq $-128, %rsp + popq %rbx + popq %r10 + .cfi_remember_state + .cfi_def_cfa 10, 0 + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + leaq -8(%r10), %rsp + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L865: + .cfi_restore_state + movq (%r12), %rcx + leaq 8(%r11), %rdi + andq $-8, %rdi + movq %rcx, (%r11) + movl %r8d, %ecx + movq -8(%r12,%rcx), %rsi + movq %rsi, -8(%r11,%rcx) + movq %r11, %rcx + movq %r12, %rsi + subq %rdi, %rcx + subq 
%rcx, %rsi + addl %r8d, %ecx + shrl $3, %ecx + rep movsq + jmp .L866 + .p2align 4,,10 + .p2align 3 +.L861: + movq (%r11), %rcx + leaq 8(%rax), %rdi + andq $-8, %rdi + movq %rcx, (%rax) + movl %r8d, %ecx + movq -8(%r11,%rcx), %rsi + movq %rsi, -8(%rax,%rcx) + movq %rax, %rcx + movq %r11, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %r8d, %ecx + shrl $3, %ecx + rep movsq + jmp .L862 + .p2align 4,,10 + .p2align 3 +.L958: + movl (%r12), %ecx + jmp .L847 + .p2align 4,,10 + .p2align 3 +.L960: + subq %rsi, %r9 + leaq 0(%r13,%rsi,4), %r11 + leaq (%r12,%r8), %rsi +.L879: + movq $-1, %rdi + bzhi %r9, %rdi, %rdi + movzwl %di, %edi + kmovd %edi, %k0 +.L860: + vmovdqu64 (%r11), %zmm1 + movq $-1, %r8 + vpcmpd $6, %zmm0, %zmm1, %k1 + kandnw %k0, %k1, %k2 + vpcompressd %zmm1, %zmm2{%k2}{z} + kmovw %k2, %edi + kandw %k0, %k1, %k1 + popcntq %rdi, %rdi + bzhi %rdi, %r8, %r8 + kmovw %r8d, %k4 + vmovdqu32 %zmm2, (%rax){%k4} + leaq (%rax,%rdi,4), %rax + kmovw %k1, %r8d + popcntq %r8, %r8 + addq %rcx, %r8 + movq %rax, %r9 + movq %rdx, %rcx + vpcompressd %zmm1, %zmm2{%k1}{z} + subq %r13, %r9 + subq %r8, %rcx + vmovdqu64 %zmm2, (%rsi) + salq $2, %r8 + sarq $2, %r9 + leaq 0(%r13,%rcx,4), %r11 + subq %r9, %rdx + movq %rdx, %r14 + movq %rcx, %rdx + subq %r9, %rdx + leaq (%rax,%r14,4), %rdi + leaq (%rax,%rdx,4), %r10 + vmovq %rdi, %xmm8 + jmp .L859 + .p2align 4,,10 + .p2align 3 +.L964: + movzbl (%r12), %esi + movb %sil, (%r11) + testb $2, %cl + je .L866 + movzwl -2(%r12,%rcx), %esi + movw %si, -2(%r11,%rcx) + jmp .L866 + .p2align 4,,10 + .p2align 3 +.L962: + movzbl (%r11), %esi + movb %sil, (%rax) + testb $2, %cl + je .L862 + movzwl -2(%r11,%rcx), %esi + movw %si, -2(%rax,%rcx) + jmp .L862 + .p2align 4,,10 + .p2align 3 +.L855: + leaq -16(%r9), %rsi + leaq 1(%r9), %rdi + andq $-16, %rsi + leaq 0(,%rcx,4), %r8 + addq $16, %rsi + cmpq $16, %rdi + movl $16, %edi + cmovbe %rdi, %rsi + cmpq %r9, %rsi + je .L857 + subq %rsi, %r9 + leaq 0(%r13,%rsi,4), %r11 + leaq (%r12,%r8), %rsi + cmpq $255, %r9 + jbe .L879 + movl $65535, %edi + kmovd %edi, %k0 + jmp .L860 + .p2align 4,,10 + .p2align 3 +.L956: + cmpq $1, %rdx + jbe .L951 + leaq 1024(%rdi), %rax + cmpq %rax, %rsi + jb .L968 + movl $16, %esi + call _ZN3hwy11N_AVX3_ZEN46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + jmp .L951 + .p2align 4,,10 + .p2align 3 +.L803: + vmovdqu32 0(%r13), %zmm6 + movl $16, %esi + movq $-1, %rax + subq -144(%rbp), %rsi + bzhi %rsi, %rax, %rax + kmovw %eax, %k6 + vpcmpd $4, %zmm0, %zmm6, %k0 + kandw %k6, %k0, %k0 + kmovw %k0, %eax + kortestw %k0, %k0 + jne .L969 + vpxor %xmm4, %xmm4, %xmm4 + leaq 1024(%r13,%rsi,4), %rdi + vmovdqa64 %zmm4, %zmm3 + .p2align 4,,10 + .p2align 3 +.L809: + movq %rsi, %rcx + addq $256, %rsi + cmpq %rsi, %r15 + jb .L813 + leaq -1024(%rdi), %rax +.L808: + vpxord (%rax), %zmm0, %zmm1 + vpxord 64(%rax), %zmm0, %zmm2 + leaq 128(%rax), %rdx + vpord %zmm3, %zmm1, %zmm3 + vpord %zmm4, %zmm2, %zmm4 + vpxord 128(%rax), %zmm0, %zmm1 + vpxord 192(%rax), %zmm0, %zmm2 + vpord %zmm3, %zmm1, %zmm3 + vpxord 256(%rax), %zmm0, %zmm1 + vpord %zmm4, %zmm2, %zmm4 + vpxord 320(%rax), %zmm0, %zmm2 + leaq 384(%rdx), %rax + vpord %zmm3, %zmm1, %zmm3 + vpxord 256(%rdx), %zmm0, %zmm1 + vpord %zmm4, %zmm2, %zmm4 + vpxord 320(%rdx), %zmm0, %zmm2 + vpord %zmm3, %zmm1, %zmm1 + vpord %zmm4, %zmm2, %zmm2 + vmovdqa64 %zmm1, %zmm3 + vmovdqa64 %zmm2, %zmm4 + cmpq %rax, %rdi + jne .L808 + vpord %zmm2, %zmm1, %zmm1 + leaq 1408(%rdx), %rdi + vptestnmd %zmm1, %zmm1, %k0 + kortestw %k0, %k0 + setc %al 
+ testb %al, %al + jne .L809 + vmovdqa32 0(%r13,%rcx,4), %zmm6 + vpcmpd $4, %zmm0, %zmm6, %k0 + kortestw %k0, %k0 + jne .L811 + .p2align 4,,10 + .p2align 3 +.L810: + addq $16, %rcx + vmovdqa32 0(%r13,%rcx,4), %zmm6 + vpcmpd $4, %zmm0, %zmm6, %k0 + kortestw %k0, %k0 + je .L810 +.L811: + kmovw %k0, %eax + tzcntl %eax, %eax + addq %rcx, %rax +.L807: + vpbroadcastd 0(%r13,%rax,4), %zmm5 + vmovdqa64 %zmm0, %zmm2 + leaq 0(%r13,%rax,4), %rdi + vpcmpd $6, %zmm0, %zmm5, %k0 + vmovdqa64 %zmm5, %zmm1 + kortestw %k0, %k0 + jne .L816 + leaq -16(%r15), %rax + xorl %ecx, %ecx + jmp .L822 + .p2align 4,,10 + .p2align 3 +.L817: + kmovw %k0, %edx + popcntq %rdx, %rdx + addq %rdx, %rcx + leaq -16(%rax), %rdx + vmovdqu64 %zmm2, 0(%r13,%rax,4) + cmpq %rdx, %r15 + jbe .L970 + movq %rdx, %rax +.L822: + vmovdqu32 0(%r13,%rax,4), %zmm6 + vpcmpd $0, %zmm5, %zmm6, %k1 + vpcmpd $0, %zmm0, %zmm6, %k0 + kmovw %k1, %edx + kmovw %k0, %esi + korw %k0, %k1, %k1 + kortestw %k1, %k1 + jc .L817 + kmovw %edx, %k0 + kmovw %esi, %k5 + kxnorw %k5, %k0, %k0 + kmovw %k0, %edx + tzcntl %edx, %edx + leaq 16(%rax), %rsi + addq %rax, %rdx + addq $32, %rax + vpbroadcastd 0(%r13,%rdx,4), %zmm2 + movq %r15, %rdx + subq %rcx, %rdx + vmovdqa64 %zmm2, %zmm0 + vmovdqa64 %zmm2, -112(%rbp) + cmpq %rdx, %rax + ja .L818 + .p2align 4,,10 + .p2align 3 +.L819: + vmovdqu64 %zmm1, -64(%r13,%rax,4) + movq %rax, %rsi + addq $16, %rax + cmpq %rax, %rdx + jnb .L819 +.L818: + subq %rsi, %rdx + leaq 0(%r13,%rsi,4), %rcx + movl $65535, %eax + cmpq $255, %rdx + ja .L820 + movq $-1, %rax + bzhi %rdx, %rax, %rax + movzwl %ax, %eax +.L820: + kmovw %eax, %k3 + vmovdqu32 %zmm5, (%rcx){%k3} +.L821: + vpbroadcastd (%r12), %zmm3 + vpcmpd $0, .LC8(%rip), %zmm3, %k0 + vmovdqa64 %zmm3, %zmm1 + kortestw %k0, %k0 + jc .L901 + vpcmpd $0, .LC9(%rip), %zmm3, %k0 + kortestw %k0, %k0 + jc .L839 + vpminsd %zmm0, %zmm5, %zmm2 + vpcmpd $6, %zmm2, %zmm3, %k0 + kortestw %k0, %k0 + jne .L971 + vmovdqa64 %zmm3, %zmm2 + movl $256, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L834: + movq %rax, %rdx + addq $1, %rax + salq $4, %rdx + addq %rcx, %rdx + vpminsd 0(%r13,%rdx,4), %zmm2, %zmm0 + vmovdqa64 %zmm0, %zmm2 + cmpq $16, %rax + jne .L834 + vpcmpd $6, %zmm0, %zmm3, %k0 + kortestw %k0, %k0 + jne .L955 + leaq 256(%rsi), %rax + cmpq %rax, %r15 + jb .L841 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L834 + .p2align 4,,10 + .p2align 3 +.L813: + movq %rcx, %rdx + addq $16, %rcx + cmpq %rcx, %r15 + jb .L972 + vmovdqa32 -64(%r13,%rcx,4), %zmm6 + vpcmpd $4, %zmm0, %zmm6, %k0 + kmovw %k0, %eax + kortestw %k0, %k0 + je .L813 +.L953: + tzcntl %eax, %eax + addq %rdx, %rax + jmp .L807 + .p2align 4,,10 + .p2align 3 +.L854: + movq %r12, %rsi + movq %r13, %r11 + movq %r13, %rax + movq %rdx, %r14 + vmovq %r10, %xmm8 + testq %r9, %r9 + jne .L879 + jmp .L880 +.L967: + vzeroupper + jmp .L876 +.L906: + movq %rax, %rsi + xorl %ecx, %ecx + jmp .L870 +.L905: + movq %rax, %rdi + movq %r14, %rcx + jmp .L869 +.L959: + leaq -1(%r15), %rbx + movq %rbx, %r12 + shrq %r12 + .p2align 4,,10 + .p2align 3 +.L852: + movq %r12, %rdx + movq %r15, %rsi + movq %r13, %rdi + call _ZN3hwy11N_AVX3_ZEN46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0 + subq $1, %r12 + jnb .L852 + .p2align 4,,10 + .p2align 3 +.L853: + movl 0(%r13,%rbx,4), %edx + movl 0(%r13), %eax + movq %rbx, %rsi + movq %r13, %rdi + movl %edx, 0(%r13) + xorl %edx, %edx + movl %eax, 0(%r13,%rbx,4) + call 
_ZN3hwy11N_AVX3_ZEN46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0 + subq $1, %rbx + jne .L853 + jmp .L951 + .p2align 4,,10 + .p2align 3 +.L842: + vpcmpd $6, -64(%r13,%rsi,4), %zmm3, %k0 + kortestw %k0, %k0 + jne .L955 +.L841: + movq %rsi, %rax + addq $16, %rsi + cmpq %rsi, %r15 + jnb .L842 + cmpq %rax, %r15 + je .L901 + vpcmpd $6, -64(%r13,%r15,4), %zmm3, %k0 + xorl %eax, %eax + kortestw %k0, %k0 + sete %al + addl $1, %eax + movl %eax, -136(%rbp) + jmp .L843 +.L961: + movl (%r11), %esi + movl %esi, (%rax) + movl -4(%r11,%rcx), %esi + movl %esi, -4(%rax,%rcx) + jmp .L862 +.L963: + movl (%r12), %esi + movl %esi, (%r11) + movl -4(%r12,%rcx), %esi + movl %esi, -4(%r11,%rcx) + jmp .L866 +.L816: + movq %r15, %rsi + leaq -112(%rbp), %rdx + movq %r12, %rcx + subq %rax, %rsi + call _ZN3hwy11N_AVX3_ZEN46detail22MaybePartitionTwoValueINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, %al + jne .L951 + vmovdqa64 -112(%rbp), %zmm0 + jmp .L821 +.L970: + movl $65535, %edi + vmovdqu64 0(%r13), %zmm2 + kmovd %edi, %k2 + cmpq $255, %rax + ja .L823 + movq $-1, %rdx + bzhi %rax, %rdx, %rdx + movzwl %dx, %edi + kmovd %edi, %k2 +.L823: + vpcmpd $0, %zmm5, %zmm2, %k0 + vpcmpd $0, %zmm0, %zmm2, %k1 + movq %r15, %rdx + kandw %k2, %k1, %k1 + knotw %k2, %k2 + korw %k1, %k0, %k0 + korw %k2, %k0, %k0 + kortestw %k0, %k0 + setc %sil + subq %rcx, %rdx + testb %sil, %sil + je .L973 + xorl %ecx, %ecx + kmovw %k1, %eax + vmovdqu64 %zmm0, 0(%r13) + popcntq %rax, %rcx + movq %rdx, %rax + subq %rcx, %rax + cmpq $15, %rax + jbe .L828 + leaq -16(%rax), %rcx + movq -152(%rbp), %rsi + movq %rcx, %rdx + shrq $4, %rdx + salq $6, %rdx + leaq 64(%r13,%rdx), %rdx + .p2align 4,,10 + .p2align 3 +.L829: + vmovdqu64 %zmm1, (%rsi) + addq $64, %rsi + cmpq %rsi, %rdx + jne .L829 + andq $-16, %rcx + movl $65535, %ebx + vmovdqa64 %zmm5, (%r12) + leaq 16(%rcx), %rdx + kmovd %ebx, %k1 + subq %rdx, %rax + leaq 0(%r13,%rdx,4), %r13 + cmpq $255, %rax + jbe .L881 +.L830: + vmovdqu32 (%r12), %zmm0{%k1}{z} + vmovdqu32 %zmm0, 0(%r13){%k1} + vzeroupper + jmp .L951 +.L838: + vmovdqu32 -64(%r13,%rsi,4), %zmm6 + vpcmpd $6, %zmm3, %zmm6, %k0 + kortestw %k0, %k0 + jne .L955 +.L837: + movq %rsi, %rax + addq $16, %rsi + cmpq %rsi, %r15 + jnb .L838 + cmpq %rax, %r15 + je .L839 + vmovdqu32 -64(%r13,%r15,4), %zmm6 + vpcmpd $6, %zmm3, %zmm6, %k0 + kortestw %k0, %k0 + jne .L955 +.L839: + movl $3, -136(%rbp) + vpternlogd $0xFF, %zmm1, %zmm1, %zmm1 + vpaddd %zmm1, %zmm3, %zmm1 + jmp .L843 +.L972: + leaq -16(%r15), %rdx + vmovdqu32 0(%r13,%rdx,4), %zmm6 + vpcmpd $4, %zmm0, %zmm6, %k0 + kmovw %k0, %eax + kortestw %k0, %k0 + jne .L953 + vzeroupper + jmp .L951 +.L969: + tzcntl %eax, %eax + jmp .L807 +.L968: + xorl %eax, %eax + cmpq $15, %rdx + jbe .L795 + leaq -16(%rdx), %rdx + movq (%rdi), %rcx + movq %rdx, %rax + shrq $4, %rax + movq %rcx, (%r12) + addq $1, %rax + salq $6, %rax + movl %eax, %ecx + movq -8(%rdi,%rcx), %rsi + leaq 8(%r12), %rdi + andq $-8, %rdi + movq %rsi, -8(%r12,%rcx) + movq %r12, %rcx + movq %r13, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %eax, %ecx + movq %rdx, %rax + shrl $3, %ecx + andq $-16, %rax + rep movsq + addq $16, %rax +.L795: + movq %r15, %rdx + leaq 0(,%rax,4), %rbx + subq %rax, %rdx + movl $65535, %eax + leaq (%r12,%rbx), %r14 + addq %r13, %rbx + kmovd %eax, %k4 + cmpq $255, %rdx + ja .L796 + movq $-1, %rax + bzhi %rdx, %rax, %rax + movzwl %ax, %eax + kmovd %eax, %k4 +.L796: + 
leal -1(%r15), %eax + movl $32, %edx + movl $1, %esi + vmovdqu32 (%rbx), %zmm0{%k4}{z} + bsrl %eax, %eax + xorl $31, %eax + vmovdqu32 %zmm0, (%r14){%k4} + vpbroadcastq .LC10(%rip), %zmm0 + subl %eax, %edx + movl $1, %eax + shlx %rdx, %rsi, %rsi + shrq $4, %rsi + cmove %rax, %rsi + movq %r15, %rax + leaq 1(%rsi), %rdx + salq $4, %rdx + cmpq %rdx, %r15 + jnb .L800 +.L797: + vmovdqu64 %zmm0, (%r12,%rax,4) + addq $16, %rax + cmpq %rdx, %rax + jb .L797 +.L800: + movq %r12, %rdi + vzeroupper + call _ZN3hwy11N_AVX3_ZEN46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + cmpq $15, %r15 + jbe .L799 + leaq -16(%r15), %rax + movq (%r12), %rdx + leaq 8(%r13), %rdi + movq %r12, %rsi + shrq $4, %rax + andq $-8, %rdi + addq $1, %rax + movq %rdx, 0(%r13) + salq $6, %rax + movl %eax, %edx + movq -8(%r12,%rdx), %rcx + movq %rcx, -8(%r13,%rdx) + subq %rdi, %r13 + leal (%rax,%r13), %ecx + subq %r13, %rsi + shrl $3, %ecx + rep movsq +.L799: + vmovdqu32 (%r14), %zmm0{%k4}{z} + vmovdqu32 %zmm0, (%rbx){%k4} + vzeroupper + jmp .L951 +.L828: + vmovdqa64 %zmm5, (%r12) +.L881: + movq $-1, %rdx + bzhi %rax, %rdx, %rax + movzwl %ax, %eax + kmovd %eax, %k1 + jmp .L830 +.L901: + movl $2, -136(%rbp) + jmp .L843 +.L971: + vpmaxsd %zmm0, %zmm5, %zmm5 + vpcmpd $6, %zmm3, %zmm5, %k0 + kortestw %k0, %k0 + jne .L955 + vmovdqa64 %zmm3, %zmm2 + movl $256, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L835: + movq %rax, %rdx + addq $1, %rax + salq $4, %rdx + addq %rcx, %rdx + vpmaxsd 0(%r13,%rdx,4), %zmm2, %zmm0 + vmovdqa64 %zmm0, %zmm2 + cmpq $16, %rax + jne .L835 + vpcmpd $6, %zmm3, %zmm0, %k0 + kortestw %k0, %k0 + jne .L955 + leaq 256(%rsi), %rax + cmpq %rax, %r15 + jb .L837 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L835 +.L973: + knotw %k0, %k0 + kmovw %k0, %ecx + tzcntl %ecx, %ecx + vpbroadcastd 0(%r13,%rcx,4), %zmm2 + leaq 16(%rax), %rcx + vmovdqa64 %zmm2, %zmm0 + vmovdqa64 %zmm2, -112(%rbp) + cmpq %rdx, %rcx + ja .L825 +.L826: + vmovdqu64 %zmm1, -64(%r13,%rcx,4) + movq %rcx, %rax + addq $16, %rcx + cmpq %rdx, %rcx + jbe .L826 +.L825: + subq %rax, %rdx + leaq 0(%r13,%rax,4), %rsi + movl $-1, %ecx + cmpq $255, %rdx + ja .L827 + orq $-1, %rcx + bzhi %rdx, %rcx, %rcx +.L827: + kmovw %ecx, %k5 + vmovdqu32 %zmm5, (%rsi){%k5} + jmp .L821 + .cfi_endproc +.LFE18800: + .size _ZN3hwy11N_AVX3_ZEN46detail7RecurseINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0, .-_ZN3hwy11N_AVX3_ZEN46detail7RecurseINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + .section .text._ZN3hwy6N_AVX36detail7RecurseINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_AVX36detail7RecurseINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0, @function +_ZN3hwy6N_AVX36detail7RecurseINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0: +.LFB18802: + .cfi_startproc + leaq 8(%rsp), %r10 + .cfi_def_cfa 10, 0 + andq $-64, %rsp + pushq -8(%r10) + pushq %rbp + movq %rsp, %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + pushq %r15 + pushq %r14 + pushq %r13 + .cfi_escape 0x10,0xf,0x2,0x76,0x78 + .cfi_escape 0x10,0xe,0x2,0x76,0x70 + .cfi_escape 0x10,0xd,0x2,0x76,0x68 + movq %rdi, %r13 + pushq %r12 + 
.cfi_escape 0x10,0xc,0x2,0x76,0x60 + movq %rcx, %r12 + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x58,0x6 + pushq %rbx + .cfi_escape 0x10,0x3,0x2,0x76,0x50 + movq %rdx, %rbx + addq $-128, %rsp + movq %rsi, -128(%rbp) + movq %r9, -120(%rbp) + cmpq $256, %rdx + jbe .L1141 + movq %rdi, %rax + movq %rdi, -152(%rbp) + movq %r8, %r14 + shrq $2, %rax + movq %rax, %rdi + andl $15, %edi + movq %rdi, -144(%rbp) + jne .L1142 + movq %rdx, -136(%rbp) + movq %r13, %r11 +.L986: + movq 8(%r14), %rdx + movq 16(%r14), %r9 + movq %rdx, %rsi + leaq 1(%r9), %rdi + leaq (%rdx,%rdx,8), %rcx + xorq (%r14), %rdi + shrq $11, %rsi + rorx $40, %rdx, %rax + leaq 2(%r9), %rdx + addq %rdi, %rax + xorq %rsi, %rcx + movq %rax, %r8 + rorx $40, %rax, %rsi + xorq %rdx, %rcx + shrq $11, %r8 + leaq (%rax,%rax,8), %rdx + leaq 3(%r9), %rax + addq %rcx, %rsi + xorq %r8, %rdx + movq %rsi, %r8 + xorq %rax, %rdx + leaq (%rsi,%rsi,8), %rax + rorx $40, %rsi, %r10 + shrq $11, %r8 + addq %rdx, %r10 + leaq 4(%r9), %rsi + addq $5, %r9 + xorq %r8, %rax + movq %r10, %r15 + rorx $40, %r10, %r8 + movq %r9, 16(%r14) + xorq %rsi, %rax + shrq $11, %r15 + leaq (%r10,%r10,8), %rsi + addq %rax, %r8 + xorq %r15, %rsi + movq %r8, %r15 + leaq (%r8,%r8,8), %r10 + xorq %r9, %rsi + rorx $40, %r8, %r8 + shrq $11, %r15 + addq %rsi, %r8 + movl %esi, %esi + movabsq $68719476719, %r9 + xorq %r15, %r10 + movq -136(%rbp), %r15 + vmovq %r10, %xmm6 + vpinsrq $1, %r8, %xmm6, %xmm1 + movq %r15, %r8 + shrq $4, %r8 + cmpq %r9, %r15 + movl $4294967295, %r9d + vmovdqu %xmm1, (%r14) + cmova %r9, %r8 + movl %ecx, %r9d + shrq $32, %rcx + leaq 192(%r12), %r15 + imulq %r8, %r9 + imulq %r8, %rcx + imulq %r8, %rsi + shrq $32, %r9 + salq $6, %r9 + shrq $32, %rcx + vmovdqa32 (%r11,%r9), %zmm3 + movl %edi, %r9d + shrq $32, %rdi + imulq %r8, %r9 + salq $6, %rcx + shrq $32, %rsi + imulq %r8, %rdi + salq $6, %rsi + vmovdqa32 (%r11,%rsi), %zmm5 + shrq $32, %r9 + salq $6, %r9 + shrq $32, %rdi + vmovdqa32 (%r11,%r9), %zmm2 + salq $6, %rdi + vpminsd %zmm3, %zmm2, %zmm1 + vpmaxsd (%r11,%rdi), %zmm1, %zmm1 + movq %rdx, %rdi + movl %edx, %edx + shrq $32, %rdi + imulq %r8, %rdx + vpmaxsd %zmm3, %zmm2, %zmm2 + imulq %r8, %rdi + vpminsd %zmm2, %zmm1, %zmm1 + vmovdqa32 (%r11,%rcx), %zmm2 + vpbroadcastd %xmm1, %zmm0 + vmovdqa32 %zmm1, (%r12) + shrq $32, %rdx + vpxord %zmm0, %zmm1, %zmm1 + shrq $32, %rdi + salq $6, %rdx + salq $6, %rdi + vmovdqa32 (%r11,%rdi), %zmm4 + vpminsd %zmm4, %zmm2, %zmm3 + vpmaxsd (%r11,%rdx), %zmm3, %zmm3 + movl %eax, %edx + shrq $32, %rax + imulq %r8, %rdx + vpmaxsd %zmm4, %zmm2, %zmm2 + imulq %r8, %rax + vpminsd %zmm2, %zmm3, %zmm3 + vmovdqa32 %zmm3, 64(%r12) + vpxord %zmm0, %zmm3, %zmm3 + shrq $32, %rdx + vpord %zmm3, %zmm1, %zmm1 + salq $6, %rdx + shrq $32, %rax + vmovdqa32 (%r11,%rdx), %zmm4 + salq $6, %rax + vpminsd %zmm5, %zmm4, %zmm2 + vpmaxsd (%r11,%rax), %zmm2, %zmm2 + vpmaxsd %zmm5, %zmm4, %zmm4 + vpminsd %zmm4, %zmm2, %zmm2 + vmovdqa32 %zmm2, 128(%r12) + vpxord %zmm0, %zmm2, %zmm2 + vpord %zmm2, %zmm1, %zmm1 + vptestnmd %zmm1, %zmm1, %k0 + kortestw %k0, %k0 + jc .L988 + vpbroadcastq .LC10(%rip), %zmm0 + movl $4, %esi + movq %r12, %rdi + vmovdqu64 %zmm0, 192(%r12) + vmovdqu64 %zmm0, 256(%r12) + vzeroupper + call _ZN3hwy6N_AVX36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + vpbroadcastd (%r12), %zmm0 + vpbroadcastd 188(%r12), %zmm1 + vpternlogd $0xFF, %zmm2, %zmm2, %zmm2 + vpaddd %zmm2, %zmm1, %zmm2 + vpcmpd $0, %zmm2, %zmm0, %k0 + kortestw %k0, %k0 + jnc .L990 + leaq -112(%rbp), %rdx + movq %r15, %rcx 
+ movq %rbx, %rsi + movq %r13, %rdi + call _ZN3hwy6N_AVX36detail22MaybePartitionTwoValueINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, %al + jne .L1136 +.L990: + movl 96(%r12), %ecx + movl $23, %eax + movl $24, %edx + cmpl %ecx, 92(%r12) + je .L1030 + jmp .L1035 + .p2align 4,,10 + .p2align 3 +.L1033: + testq %rax, %rax + je .L1143 +.L1030: + movq %rax, %rdx + subq $1, %rax + movl (%r12,%rax,4), %esi + cmpl %esi, %ecx + je .L1033 + cmpl (%r12,%rdx,4), %ecx + je .L1035 + movl %esi, %ecx + jmp .L1032 + .p2align 4,,10 + .p2align 3 +.L1036: + cmpq $47, %rdx + je .L1139 +.L1035: + movq %rdx, %rsi + addq $1, %rdx + cmpl (%r12,%rdx,4), %ecx + je .L1036 + movl $24, %edx + subq $23, %rsi + subq %rax, %rdx + cmpq %rdx, %rsi + jb .L1032 +.L1139: + movl (%r12,%rax,4), %ecx +.L1032: + vpbroadcastd %ecx, %zmm1 +.L1140: + movl $1, -136(%rbp) +.L1028: + cmpq $0, -120(%rbp) + je .L1144 + leaq -16(%rbx), %rdx + vmovdqa32 %zmm1, %zmm0 + leaq 0(%r13,%rdx,4), %r10 + movq %rdx, %r9 + movq %rdx, %rcx + vmovdqu64 (%r10), %zmm13 + andl $63, %r9d + andl $48, %ecx + je .L1039 + vmovdqu64 0(%r13), %zmm2 + vpcmpd $6, %zmm1, %zmm2, %k2 + knotw %k2, %k1 + vpcompressd %zmm2, 0(%r13){%k1} + kmovw %k1, %eax + kmovw %k2, %ecx + popcntq %rax, %rax + popcntq %rcx, %rcx + leaq 0(%r13,%rax,4), %rax + vpcompressd %zmm2, (%r12){%k2} + testb $32, %dl + je .L1040 + vmovdqu64 64(%r13), %zmm1 + vpcmpd $6, %zmm0, %zmm1, %k1 + knotw %k1, %k2 + vpcompressd %zmm1, (%rax){%k2} + kmovw %k2, %esi + popcntq %rsi, %rsi + vpcompressd %zmm1, (%r12,%rcx,4){%k1} + leaq (%rax,%rsi,4), %rax + kmovw %k1, %esi + popcntq %rsi, %rsi + addq %rsi, %rcx + cmpq $47, %r9 + jbe .L1040 + vmovdqu64 128(%r13), %zmm1 + vpcmpd $6, %zmm0, %zmm1, %k1 + knotw %k1, %k2 + vpcompressd %zmm1, (%rax){%k2} + kmovw %k2, %esi + popcntq %rsi, %rsi + vpcompressd %zmm1, (%r12,%rcx,4){%k1} + leaq (%rax,%rsi,4), %rax + kmovw %k1, %esi + popcntq %rsi, %rsi + addq %rsi, %rcx + leaq 1(%r9), %rsi + cmpq $17, %rsi + leaq 0(,%rcx,4), %r8 + sbbq %rsi, %rsi + andq $-32, %rsi + addq $48, %rsi + cmpq %rsi, %r9 + jne .L1145 +.L1042: + movq %rax, %r9 + movq %rdx, %rsi + subq %r13, %r9 + subq %rcx, %rsi + sarq $2, %r9 + leaq 0(%r13,%rsi,4), %r11 + subq %r9, %rdx + subq %r9, %rsi + movq %rdx, %r15 + leaq (%rax,%rsi,4), %r10 + movq %rsi, %rdx + leaq (%rax,%r15,4), %rdi + vmovq %rdi, %xmm14 + .p2align 4,,10 + .p2align 3 +.L1044: + movl %r8d, %ecx + cmpl $8, %r8d + jnb .L1046 + testb $4, %r8b + jne .L1146 + testl %ecx, %ecx + jne .L1147 +.L1047: + movl %r8d, %ecx + cmpl $8, %r8d + jnb .L1050 + andl $4, %r8d + jne .L1148 + testl %ecx, %ecx + jne .L1149 +.L1051: + testq %rdx, %rdx + je .L1090 +.L1065: + leaq 256(%rax), %rsi + leaq -256(%r10), %rdi + vmovdqu64 (%rax), %zmm12 + vmovdqu64 64(%rax), %zmm11 + vmovdqu64 128(%rax), %zmm10 + vmovdqu64 192(%rax), %zmm9 + vmovdqu64 -128(%r10), %zmm6 + vmovdqu64 -64(%r10), %zmm5 + vmovdqu64 -256(%r10), %zmm8 + vmovdqu64 -192(%r10), %zmm7 + cmpq %rdi, %rsi + je .L1091 + xorl %ecx, %ecx + jmp .L1058 + .p2align 4,,10 + .p2align 3 +.L1151: + vmovdqu64 -128(%rdi), %zmm2 + vmovdqu64 -64(%rdi), %zmm1 + prefetcht0 -1024(%rdi) + subq $256, %rdi + vmovdqu64 (%rdi), %zmm4 + vmovdqu64 64(%rdi), %zmm3 +.L1057: + vpcmpd $6, %zmm0, %zmm4, %k2 + knotw %k2, %k1 + kmovw %k1, %r8d + popcntq %r8, %r8 + addq %rcx, %r8 + vpcompressd %zmm4, (%rax,%rcx,4){%k1} + leaq -16(%rdx,%r8), %rcx + vpcompressd %zmm4, (%rax,%rcx,4){%k2} + vpcmpd $6, %zmm0, %zmm3, %k2 + knotw %k2, %k1 + kmovw 
%k1, %ecx + popcntq %rcx, %rcx + addq %r8, %rcx + vpcompressd %zmm3, (%rax,%r8,4){%k1} + leaq -32(%rdx,%rcx), %r8 + vpcompressd %zmm3, (%rax,%r8,4){%k2} + vpcmpd $6, %zmm0, %zmm2, %k2 + knotw %k2, %k1 + kmovw %k1, %r8d + popcntq %r8, %r8 + addq %rcx, %r8 + vpcompressd %zmm2, (%rax,%rcx,4){%k1} + leaq -48(%rdx,%r8), %rcx + subq $64, %rdx + vpcompressd %zmm2, (%rax,%rcx,4){%k2} + vpcmpd $6, %zmm0, %zmm1, %k2 + knotw %k2, %k1 + kmovw %k1, %ecx + popcntq %rcx, %rcx + addq %r8, %rcx + vpcompressd %zmm1, (%rax,%r8,4){%k1} + leaq (%rdx,%rcx), %r8 + vpcompressd %zmm1, (%rax,%r8,4){%k2} + cmpq %rdi, %rsi + je .L1150 +.L1058: + movq %rsi, %r8 + subq %rax, %r8 + sarq $2, %r8 + subq %rcx, %r8 + cmpq $64, %r8 + ja .L1151 + vmovdqu64 (%rsi), %zmm4 + vmovdqu64 64(%rsi), %zmm3 + prefetcht0 1024(%rsi) + addq $256, %rsi + vmovdqu64 -128(%rsi), %zmm2 + vmovdqu64 -64(%rsi), %zmm1 + jmp .L1057 + .p2align 4,,10 + .p2align 3 +.L1142: + movl $16, %eax + subq %rdi, %rax + leaq 0(%r13,%rax,4), %r11 + leaq -16(%rdi,%rdx), %rax + movq %rax, -136(%rbp) + jmp .L986 + .p2align 4,,10 + .p2align 3 +.L1150: + leaq (%rax,%rcx,4), %rsi +.L1055: + vpcmpd $6, %zmm0, %zmm12, %k1 + movq %r15, %rdi + knotw %k1, %k2 + vpcompressd %zmm12, (%rsi){%k2} + kmovw %k2, %esi + popcntq %rsi, %rsi + addq %rsi, %rcx + leaq -16(%rdx,%rcx), %rsi + vpcompressd %zmm12, (%rax,%rsi,4){%k1} + vpcmpd $6, %zmm0, %zmm11, %k1 + knotw %k1, %k2 + kmovw %k2, %esi + popcntq %rsi, %rsi + addq %rcx, %rsi + vpcompressd %zmm11, (%rax,%rcx,4){%k2} + leaq -32(%rdx,%rsi), %rcx + vpcompressd %zmm11, (%rax,%rcx,4){%k1} + vpcmpd $6, %zmm0, %zmm10, %k1 + knotw %k1, %k2 + kmovw %k2, %ecx + popcntq %rcx, %rcx + addq %rsi, %rcx + vpcompressd %zmm10, (%rax,%rsi,4){%k2} + leaq -48(%rdx,%rcx), %rsi + vpcompressd %zmm10, (%rax,%rsi,4){%k1} + vpcmpd $6, %zmm0, %zmm9, %k1 + knotw %k1, %k2 + kmovw %k2, %esi + popcntq %rsi, %rsi + addq %rcx, %rsi + vpcompressd %zmm9, (%rax,%rcx,4){%k2} + leaq -64(%rdx,%rsi), %rcx + vpcompressd %zmm9, (%rax,%rcx,4){%k1} + vpcmpd $6, %zmm0, %zmm8, %k1 + knotw %k1, %k2 + kmovw %k2, %ecx + popcntq %rcx, %rcx + addq %rsi, %rcx + vpcompressd %zmm8, (%rax,%rsi,4){%k2} + leaq -80(%rdx,%rcx), %rsi + vpcompressd %zmm8, (%rax,%rsi,4){%k1} + vpcmpd $6, %zmm0, %zmm7, %k1 + knotw %k1, %k2 + kmovw %k2, %esi + popcntq %rsi, %rsi + addq %rcx, %rsi + vpcompressd %zmm7, (%rax,%rcx,4){%k2} + leaq -96(%rdx,%rsi), %rcx + vpcompressd %zmm7, (%rax,%rcx,4){%k1} + vpcmpd $6, %zmm0, %zmm6, %k1 + knotw %k1, %k2 + kmovw %k2, %ecx + popcntq %rcx, %rcx + addq %rsi, %rcx + vpcompressd %zmm6, (%rax,%rsi,4){%k2} + leaq -112(%rdx,%rcx), %rsi + vpcompressd %zmm6, (%rax,%rsi,4){%k1} + vpcmpd $6, %zmm0, %zmm5, %k1 + leaq -128(%rdx), %rsi + knotw %k1, %k2 + kmovw %k2, %edx + popcntq %rdx, %rdx + addq %rcx, %rdx + vpcompressd %zmm5, (%rax,%rcx,4){%k2} + leaq (%rsi,%rdx), %rcx + subq %rdx, %rdi + vpcompressd %zmm5, (%rax,%rcx,4){%k1} + leaq (%rax,%rdx,4), %rcx +.L1054: + movq %rcx, %rsi + cmpq $15, %rdi + ja .L1059 + leaq -64(%rax,%r15,4), %rsi +.L1059: + vmovdqu64 (%rsi), %zmm7 + vpcmpd $6, %zmm0, %zmm13, %k2 + vmovq %xmm14, %rdi + vmovdqu64 %zmm7, (%rdi) + knotw %k2, %k1 + vpcompressd %zmm13, (%rcx){%k1} + kmovw %k1, %ecx + popcntq %rcx, %rcx + addq %rcx, %rdx + vpcompressd %zmm13, (%rax,%rdx,4){%k2} + leaq (%rdx,%r9), %r15 + movq -120(%rbp), %r9 + subq $1, %r9 + cmpl $2, -136(%rbp) + je .L1152 + movq -128(%rbp), %rsi + movq %r14, %r8 + movq %r12, %rcx + movq %r15, %rdx + movq %r13, %rdi + movq %r9, -120(%rbp) + vzeroupper + call 
_ZN3hwy6N_AVX36detail7RecurseINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + cmpl $3, -136(%rbp) + movq -120(%rbp), %r9 + je .L1136 +.L1061: + subq %r15, %rbx + movq -128(%rbp), %rsi + leaq 0(%r13,%r15,4), %rdi + movq %r14, %r8 + movq %rbx, %rdx + movq %r12, %rcx + call _ZN3hwy6N_AVX36detail7RecurseINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L1136: + subq $-128, %rsp + popq %rbx + popq %r10 + .cfi_remember_state + .cfi_def_cfa 10, 0 + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + leaq -8(%r10), %rsp + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L1050: + .cfi_restore_state + movq (%r12), %rcx + leaq 8(%r11), %rdi + andq $-8, %rdi + movq %rcx, (%r11) + movl %r8d, %ecx + movq -8(%r12,%rcx), %rsi + movq %rsi, -8(%r11,%rcx) + movq %r11, %rcx + movq %r12, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %r8d, %ecx + shrl $3, %ecx + rep movsq + jmp .L1051 + .p2align 4,,10 + .p2align 3 +.L1046: + movq (%r11), %rcx + leaq 8(%rax), %rdi + andq $-8, %rdi + movq %rcx, (%rax) + movl %r8d, %ecx + movq -8(%r11,%rcx), %rsi + movq %rsi, -8(%rax,%rcx) + movq %rax, %rcx + movq %r11, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %r8d, %ecx + shrl $3, %ecx + rep movsq + jmp .L1047 + .p2align 4,,10 + .p2align 3 +.L1143: + movl (%r12), %ecx + jmp .L1032 + .p2align 4,,10 + .p2align 3 +.L1145: + subq %rsi, %r9 + leaq 0(%r13,%rsi,4), %r11 + leaq (%r12,%r8), %rsi +.L1064: + movq $-1, %rdi + bzhi %r9, %rdi, %rdi + movzwl %di, %edi + kmovd %edi, %k0 +.L1045: + vmovdqu64 (%r11), %zmm1 + vpcmpd $6, %zmm0, %zmm1, %k1 + kandnw %k0, %k1, %k2 + vpcompressd %zmm1, (%rax){%k2} + kmovw %k2, %edi + kandw %k0, %k1, %k1 + popcntq %rdi, %rdi + leaq (%rax,%rdi,4), %rax + kmovw %k1, %r8d + popcntq %r8, %r8 + movq %rax, %r9 + addq %rcx, %r8 + movq %rdx, %rcx + vpcompressd %zmm1, (%rsi){%k1} + subq %r13, %r9 + subq %r8, %rcx + salq $2, %r8 + sarq $2, %r9 + leaq 0(%r13,%rcx,4), %r11 + subq %r9, %rdx + movq %rdx, %r15 + movq %rcx, %rdx + subq %r9, %rdx + leaq (%rax,%r15,4), %rdi + leaq (%rax,%rdx,4), %r10 + vmovq %rdi, %xmm14 + jmp .L1044 + .p2align 4,,10 + .p2align 3 +.L1149: + movzbl (%r12), %esi + movb %sil, (%r11) + testb $2, %cl + je .L1051 + movzwl -2(%r12,%rcx), %esi + movw %si, -2(%r11,%rcx) + jmp .L1051 + .p2align 4,,10 + .p2align 3 +.L1147: + movzbl (%r11), %esi + movb %sil, (%rax) + testb $2, %cl + je .L1047 + movzwl -2(%r11,%rcx), %esi + movw %si, -2(%rax,%rcx) + jmp .L1047 + .p2align 4,,10 + .p2align 3 +.L1040: + leaq -16(%r9), %rsi + leaq 1(%r9), %rdi + andq $-16, %rsi + leaq 0(,%rcx,4), %r8 + addq $16, %rsi + cmpq $16, %rdi + movl $16, %edi + cmovbe %rdi, %rsi + cmpq %r9, %rsi + je .L1042 + subq %rsi, %r9 + leaq 0(%r13,%rsi,4), %r11 + leaq (%r12,%r8), %rsi + cmpq $255, %r9 + jbe .L1064 + movl $65535, %edi + kmovd %edi, %k0 + jmp .L1045 + .p2align 4,,10 + .p2align 3 +.L1141: + cmpq $1, %rdx + jbe .L1136 + leaq 1024(%rdi), %rax + cmpq %rax, %rsi + jb .L1153 + movl $16, %esi + call _ZN3hwy6N_AVX36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + jmp .L1136 + .p2align 4,,10 + .p2align 3 +.L988: + vmovdqu32 0(%r13), %zmm6 + movl $16, %edi + movq $-1, %rax + subq -144(%rbp), %rdi + bzhi %rdi, %rax, %rax + kmovw %eax, %k6 + vpcmpd $4, %zmm0, %zmm6, %k0 + kandw %k6, %k0, %k0 + kmovw %k0, %eax + kortestw %k0, %k0 + jne .L1154 + vpxor %xmm4, %xmm4, %xmm4 + leaq 1024(%r13,%rdi,4), %rsi + vmovdqa64 %zmm4, 
%zmm3 + .p2align 4,,10 + .p2align 3 +.L994: + movq %rdi, %rcx + addq $256, %rdi + cmpq %rdi, %rbx + jb .L998 + leaq -1024(%rsi), %rax +.L993: + vpxord (%rax), %zmm0, %zmm1 + vpxord 64(%rax), %zmm0, %zmm2 + leaq 128(%rax), %rdx + vpord %zmm3, %zmm1, %zmm3 + vpord %zmm4, %zmm2, %zmm4 + vpxord 128(%rax), %zmm0, %zmm1 + vpxord 192(%rax), %zmm0, %zmm2 + vpord %zmm3, %zmm1, %zmm3 + vpxord 256(%rax), %zmm0, %zmm1 + vpord %zmm4, %zmm2, %zmm4 + vpxord 320(%rax), %zmm0, %zmm2 + leaq 384(%rdx), %rax + vpord %zmm3, %zmm1, %zmm3 + vpxord 256(%rdx), %zmm0, %zmm1 + vpord %zmm4, %zmm2, %zmm4 + vpxord 320(%rdx), %zmm0, %zmm2 + vpord %zmm3, %zmm1, %zmm1 + vpord %zmm4, %zmm2, %zmm2 + vmovdqa64 %zmm1, %zmm3 + vmovdqa64 %zmm2, %zmm4 + cmpq %rax, %rsi + jne .L993 + vpord %zmm2, %zmm1, %zmm1 + leaq 1408(%rdx), %rsi + vptestnmd %zmm1, %zmm1, %k0 + kortestw %k0, %k0 + setc %al + testb %al, %al + jne .L994 + vmovdqa32 0(%r13,%rcx,4), %zmm6 + vpcmpd $4, %zmm0, %zmm6, %k0 + kortestw %k0, %k0 + jne .L996 + .p2align 4,,10 + .p2align 3 +.L995: + addq $16, %rcx + vmovdqa32 0(%r13,%rcx,4), %zmm7 + vpcmpd $4, %zmm0, %zmm7, %k0 + kortestw %k0, %k0 + je .L995 +.L996: + kmovw %k0, %eax + tzcntl %eax, %eax + addq %rcx, %rax +.L992: + vpbroadcastd 0(%r13,%rax,4), %zmm5 + vmovdqa64 %zmm0, %zmm2 + leaq 0(%r13,%rax,4), %rdi + vpcmpd $6, %zmm0, %zmm5, %k0 + vmovdqa64 %zmm5, %zmm1 + kortestw %k0, %k0 + jne .L1001 + leaq -16(%rbx), %rax + xorl %ecx, %ecx + jmp .L1007 + .p2align 4,,10 + .p2align 3 +.L1002: + kmovw %k0, %edx + popcntq %rdx, %rdx + addq %rdx, %rcx + leaq -16(%rax), %rdx + vmovdqu64 %zmm2, 0(%r13,%rax,4) + cmpq %rdx, %rbx + jbe .L1155 + movq %rdx, %rax +.L1007: + vmovdqu32 0(%r13,%rax,4), %zmm6 + vpcmpd $0, %zmm5, %zmm6, %k1 + vpcmpd $0, %zmm0, %zmm6, %k0 + kmovw %k1, %edx + kmovw %k0, %esi + korw %k0, %k1, %k1 + kortestw %k1, %k1 + jc .L1002 + kmovw %edx, %k5 + kmovw %esi, %k3 + kxnorw %k3, %k5, %k7 + kmovw %k7, %edx + tzcntl %edx, %edx + leaq 16(%rax), %rsi + addq %rax, %rdx + addq $32, %rax + vpbroadcastd 0(%r13,%rdx,4), %zmm2 + movq %rbx, %rdx + subq %rcx, %rdx + vmovdqa64 %zmm2, %zmm0 + vmovdqa64 %zmm2, -112(%rbp) + cmpq %rdx, %rax + ja .L1003 + .p2align 4,,10 + .p2align 3 +.L1004: + vmovdqu64 %zmm1, -64(%r13,%rax,4) + movq %rax, %rsi + addq $16, %rax + cmpq %rdx, %rax + jbe .L1004 +.L1003: + subq %rsi, %rdx + leaq 0(%r13,%rsi,4), %rcx + movl $65535, %eax + cmpq $255, %rdx + ja .L1005 + movq $-1, %rax + bzhi %rdx, %rax, %rax + movzwl %ax, %eax +.L1005: + kmovw %eax, %k3 + vmovdqu32 %zmm5, (%rcx){%k3} +.L1006: + vpbroadcastd (%r12), %zmm3 + vpcmpd $0, .LC8(%rip), %zmm3, %k0 + vmovdqa64 %zmm3, %zmm1 + kortestw %k0, %k0 + jc .L1086 + vpcmpd $0, .LC9(%rip), %zmm3, %k0 + kortestw %k0, %k0 + jc .L1024 + vpminsd %zmm0, %zmm5, %zmm2 + vpcmpd $6, %zmm2, %zmm3, %k0 + kortestw %k0, %k0 + jne .L1156 + vmovdqa64 %zmm3, %zmm2 + movl $256, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L1019: + movq %rax, %rdx + addq $1, %rax + salq $4, %rdx + addq %rcx, %rdx + vpminsd 0(%r13,%rdx,4), %zmm2, %zmm0 + vmovdqa64 %zmm0, %zmm2 + cmpq $16, %rax + jne .L1019 + vpcmpd $6, %zmm0, %zmm3, %k0 + kortestw %k0, %k0 + jne .L1140 + leaq 256(%rsi), %rax + cmpq %rax, %rbx + jb .L1026 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L1019 + .p2align 4,,10 + .p2align 3 +.L998: + movq %rcx, %rdx + addq $16, %rcx + cmpq %rcx, %rbx + jb .L1157 + vmovdqa32 -64(%r13,%rcx,4), %zmm6 + vpcmpd $4, %zmm0, %zmm6, %k0 + kmovw %k0, %eax + kortestw %k0, %k0 + je .L998 +.L1138: + tzcntl %eax, %eax + addq %rdx, %rax + jmp .L992 
+ .p2align 4,,10 + .p2align 3 +.L1039: + movq %r12, %rsi + movq %r13, %r11 + movq %r13, %rax + movq %rdx, %r15 + vmovq %r10, %xmm14 + testq %r9, %r9 + jne .L1064 + jmp .L1065 +.L1152: + vzeroupper + jmp .L1061 +.L1091: + movq %rax, %rsi + xorl %ecx, %ecx + jmp .L1055 +.L1090: + movq %rax, %rcx + movq %r15, %rdi + jmp .L1054 +.L1144: + leaq -1(%rbx), %r12 + movq %r12, %r14 + shrq %r14 + .p2align 4,,10 + .p2align 3 +.L1037: + movq %r14, %rdx + movq %rbx, %rsi + movq %r13, %rdi + call _ZN3hwy6N_AVX36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0 + subq $1, %r14 + jnb .L1037 + .p2align 4,,10 + .p2align 3 +.L1038: + movl 0(%r13,%r12,4), %edx + movl 0(%r13), %eax + movq %r12, %rsi + movq %r13, %rdi + movl %edx, 0(%r13) + xorl %edx, %edx + movl %eax, 0(%r13,%r12,4) + call _ZN3hwy6N_AVX36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0 + subq $1, %r12 + jne .L1038 + jmp .L1136 + .p2align 4,,10 + .p2align 3 +.L1027: + vpcmpd $6, -64(%r13,%rsi,4), %zmm3, %k0 + kortestw %k0, %k0 + jne .L1140 +.L1026: + movq %rsi, %rax + addq $16, %rsi + cmpq %rsi, %rbx + jnb .L1027 + cmpq %rax, %rbx + je .L1086 + vpcmpd $6, -64(%r13,%rbx,4), %zmm3, %k0 + xorl %eax, %eax + kortestw %k0, %k0 + sete %al + addl $1, %eax + movl %eax, -136(%rbp) + jmp .L1028 +.L1146: + movl (%r11), %esi + movl %esi, (%rax) + movl -4(%r11,%rcx), %esi + movl %esi, -4(%rax,%rcx) + jmp .L1047 +.L1148: + movl (%r12), %esi + movl %esi, (%r11) + movl -4(%r12,%rcx), %esi + movl %esi, -4(%r11,%rcx) + jmp .L1051 +.L1001: + movq %rbx, %rsi + leaq -112(%rbp), %rdx + movq %r12, %rcx + subq %rax, %rsi + call _ZN3hwy6N_AVX36detail22MaybePartitionTwoValueINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, %al + jne .L1136 + vmovdqa64 -112(%rbp), %zmm0 + jmp .L1006 +.L1155: + movl $65535, %edi + vmovdqu64 0(%r13), %zmm2 + kmovd %edi, %k2 + cmpq $255, %rax + ja .L1008 + movq $-1, %rdx + bzhi %rax, %rdx, %rdx + movzwl %dx, %edi + kmovd %edi, %k2 +.L1008: + vpcmpd $0, %zmm5, %zmm2, %k0 + vpcmpd $0, %zmm0, %zmm2, %k1 + movq %rbx, %rdx + kandw %k2, %k1, %k1 + knotw %k2, %k2 + korw %k1, %k0, %k0 + korw %k2, %k0, %k0 + kortestw %k0, %k0 + setc %sil + subq %rcx, %rdx + testb %sil, %sil + je .L1158 + xorl %ecx, %ecx + kmovw %k1, %eax + vmovdqu64 %zmm0, 0(%r13) + popcntq %rax, %rcx + movq %rdx, %rax + subq %rcx, %rax + cmpq $15, %rax + jbe .L1013 + leaq -16(%rax), %rcx + movq -152(%rbp), %rsi + movq %rcx, %rdx + shrq $4, %rdx + salq $6, %rdx + leaq 64(%r13,%rdx), %rdx + .p2align 4,,10 + .p2align 3 +.L1014: + vmovdqu64 %zmm1, (%rsi) + addq $64, %rsi + cmpq %rdx, %rsi + jne .L1014 + andq $-16, %rcx + movl $65535, %ebx + vmovdqa64 %zmm5, (%r12) + leaq 16(%rcx), %rdx + kmovd %ebx, %k1 + subq %rdx, %rax + leaq 0(%r13,%rdx,4), %r13 + cmpq $255, %rax + jbe .L1066 +.L1015: + vmovdqu32 (%r12), %zmm0{%k1}{z} + vmovdqu32 %zmm0, 0(%r13){%k1} + vzeroupper + jmp .L1136 +.L1023: + vmovdqu32 -64(%r13,%rsi,4), %zmm7 + vpcmpd $6, %zmm3, %zmm7, %k0 + kortestw %k0, %k0 + jne .L1140 +.L1022: + movq %rsi, %rax + addq $16, %rsi + cmpq %rsi, %rbx + jnb .L1023 + cmpq %rax, %rbx + je .L1024 + vmovdqu32 -64(%r13,%rbx,4), %zmm7 + vpcmpd $6, %zmm3, %zmm7, %k0 + kortestw %k0, %k0 + jne .L1140 +.L1024: + movl $3, -136(%rbp) + vpternlogd $0xFF, %zmm1, %zmm1, %zmm1 + vpaddd %zmm1, %zmm3, %zmm1 + jmp .L1028 +.L1157: + leaq -16(%rbx), %rdx + vmovdqu32 0(%r13,%rdx,4), %zmm7 + vpcmpd $4, %zmm0, %zmm7, 
%k0 + kmovw %k0, %eax + kortestw %k0, %k0 + jne .L1138 + vzeroupper + jmp .L1136 +.L1154: + tzcntl %eax, %eax + jmp .L992 +.L1153: + xorl %eax, %eax + cmpq $15, %rdx + jbe .L980 + leaq -16(%rdx), %rdx + movq (%rdi), %rcx + movq %rdx, %rax + shrq $4, %rax + movq %rcx, (%r12) + addq $1, %rax + salq $6, %rax + movl %eax, %ecx + movq -8(%rdi,%rcx), %rsi + leaq 8(%r12), %rdi + andq $-8, %rdi + movq %rsi, -8(%r12,%rcx) + movq %r12, %rcx + movq %r13, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %eax, %ecx + movq %rdx, %rax + shrl $3, %ecx + andq $-16, %rax + rep movsq + addq $16, %rax +.L980: + movq %rbx, %rdx + leaq 0(,%rax,4), %r14 + subq %rax, %rdx + movl $65535, %eax + leaq (%r12,%r14), %r15 + addq %r13, %r14 + kmovd %eax, %k4 + cmpq $255, %rdx + ja .L981 + movq $-1, %rax + bzhi %rdx, %rax, %rax + movzwl %ax, %eax + kmovd %eax, %k4 +.L981: + leal -1(%rbx), %eax + movl $32, %edx + movl $1, %esi + vmovdqu32 (%r14), %zmm0{%k4}{z} + bsrl %eax, %eax + xorl $31, %eax + vmovdqu32 %zmm0, (%r15){%k4} + vpbroadcastq .LC10(%rip), %zmm0 + subl %eax, %edx + movl $1, %eax + shlx %rdx, %rsi, %rsi + shrq $4, %rsi + cmove %rax, %rsi + movq %rbx, %rax + leaq 1(%rsi), %rdx + salq $4, %rdx + cmpq %rdx, %rbx + jnb .L985 +.L982: + vmovdqu64 %zmm0, (%r12,%rax,4) + addq $16, %rax + cmpq %rdx, %rax + jb .L982 +.L985: + movq %r12, %rdi + vzeroupper + call _ZN3hwy6N_AVX36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + cmpq $15, %rbx + jbe .L984 + leaq -16(%rbx), %rax + movq (%r12), %rdx + leaq 8(%r13), %rdi + movq %r12, %rsi + shrq $4, %rax + andq $-8, %rdi + addq $1, %rax + movq %rdx, 0(%r13) + salq $6, %rax + movl %eax, %edx + movq -8(%r12,%rdx), %rcx + movq %rcx, -8(%r13,%rdx) + subq %rdi, %r13 + leal (%rax,%r13), %ecx + subq %r13, %rsi + shrl $3, %ecx + rep movsq +.L984: + vmovdqu32 (%r15), %zmm0{%k4}{z} + vmovdqu32 %zmm0, (%r14){%k4} + vzeroupper + jmp .L1136 +.L1013: + vmovdqa64 %zmm5, (%r12) +.L1066: + movq $-1, %rdx + bzhi %rax, %rdx, %rax + movzwl %ax, %eax + kmovd %eax, %k1 + jmp .L1015 +.L1086: + movl $2, -136(%rbp) + jmp .L1028 +.L1156: + vpmaxsd %zmm0, %zmm5, %zmm5 + vpcmpd $6, %zmm3, %zmm5, %k0 + kortestw %k0, %k0 + jne .L1140 + vmovdqa64 %zmm3, %zmm2 + movl $256, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L1020: + movq %rax, %rdx + addq $1, %rax + salq $4, %rdx + addq %rcx, %rdx + vpmaxsd 0(%r13,%rdx,4), %zmm2, %zmm0 + vmovdqa64 %zmm0, %zmm2 + cmpq $16, %rax + jne .L1020 + vpcmpd $6, %zmm3, %zmm0, %k0 + kortestw %k0, %k0 + jne .L1140 + leaq 256(%rsi), %rax + cmpq %rax, %rbx + jb .L1022 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L1020 +.L1158: + knotw %k0, %k7 + kmovw %k7, %ecx + tzcntl %ecx, %ecx + vpbroadcastd 0(%r13,%rcx,4), %zmm2 + leaq 16(%rax), %rcx + vmovdqa64 %zmm2, %zmm0 + vmovdqa64 %zmm2, -112(%rbp) + cmpq %rdx, %rcx + ja .L1010 +.L1011: + vmovdqu64 %zmm1, -64(%r13,%rcx,4) + movq %rcx, %rax + addq $16, %rcx + cmpq %rdx, %rcx + jbe .L1011 +.L1010: + subq %rax, %rdx + leaq 0(%r13,%rax,4), %rsi + movl $-1, %ecx + cmpq $255, %rdx + ja .L1012 + orq $-1, %rcx + bzhi %rdx, %rcx, %rcx +.L1012: + kmovw %ecx, %k5 + vmovdqu32 %zmm5, (%rsi){%k5} + jmp .L1006 + .cfi_endproc +.LFE18802: + .size _ZN3hwy6N_AVX36detail7RecurseINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0, .-_ZN3hwy6N_AVX36detail7RecurseINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + .section 
.text._ZN3hwy6N_SSE46detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_SSE46detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0, @function +_ZN3hwy6N_SSE46detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0: +.LFB18804: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + pushq %r14 + pushq %r13 + .cfi_offset 15, -24 + .cfi_offset 14, -32 + .cfi_offset 13, -40 + movq %rdi, %r13 + pushq %r12 + .cfi_offset 12, -48 + movq %rcx, %r12 + pushq %rbx + subq $88, %rsp + .cfi_offset 3, -56 + movq %rsi, -96(%rbp) + movq %rdx, -72(%rbp) + movq %r8, -80(%rbp) + movq %r9, -88(%rbp) + cmpq $64, %rdx + jbe .L1449 + movq %rdi, %rax + movq %rdi, -120(%rbp) + shrq $2, %rax + movq %rax, %rdx + movq %rax, -112(%rbp) + andl $15, %edx + jne .L1450 + movq -72(%rbp), %r14 + movq %rdi, %rax + movq %r8, %r15 +.L1172: + movq 8(%r15), %rcx + movq 16(%r15), %r10 + movq %rcx, %rsi + leaq 1(%r10), %r8 + movq %rcx, %rdi + xorq (%r15), %r8 + rolq $24, %rsi + shrq $11, %rdi + leaq (%rcx,%rcx,8), %rdx + leaq 2(%r10), %rcx + addq %r8, %rsi + xorq %rdi, %rdx + movq %rsi, %rdi + xorq %rcx, %rdx + leaq (%rsi,%rsi,8), %rcx + movq %rsi, %r9 + rolq $24, %rdi + shrq $11, %r9 + leaq 3(%r10), %rsi + addq %rdx, %rdi + xorq %r9, %rcx + movq %rdi, %r11 + xorq %rsi, %rcx + leaq (%rdi,%rdi,8), %rsi + movq %rdi, %r9 + rolq $24, %r11 + shrq $11, %r9 + leaq 4(%r10), %rdi + addq $5, %r10 + addq %rcx, %r11 + xorq %r9, %rsi + movq %r10, 16(%r15) + movq %r11, %r9 + xorq %rdi, %rsi + leaq (%r11,%r11,8), %rdi + movq %r11, %rbx + rolq $24, %r9 + shrq $11, %rbx + addq %rsi, %r9 + xorq %rbx, %rdi + movq %r9, %rbx + leaq (%r9,%r9,8), %r11 + xorq %r10, %rdi + rolq $24, %r9 + shrq $11, %rbx + addq %rdi, %r9 + movl %r8d, %r10d + movl %edi, %edi + xorq %rbx, %r11 + movq %r14, %rbx + movq %r11, %xmm0 + shrq $4, %rbx + movl %ecx, %r11d + pinsrq $1, %r9, %xmm0 + movabsq $68719476719, %r9 + cmpq %r9, %r14 + movl $4294967295, %r9d + movups %xmm0, (%r15) + movl %edx, %r15d + cmova %r9, %rbx + shrq $32, %r8 + movl %esi, %r9d + shrq $32, %rdx + imulq %rbx, %r15 + shrq $32, %rcx + shrq $32, %rsi + imulq %rbx, %r10 + imulq %rbx, %r8 + shrq $32, %r15 + imulq %rbx, %rdx + imulq %rbx, %r11 + shrq $32, %r10 + imulq %rbx, %rcx + shrq $32, %r8 + imulq %rbx, %r9 + shrq $32, %rdx + imulq %rbx, %rsi + shrq $32, %r11 + imulq %rbx, %rdi + movq %r15, %rbx + shrq $32, %rcx + salq $6, %rbx + shrq $32, %r9 + movdqa (%rax,%rbx), %xmm1 + movq %r10, %rbx + shrq $32, %rsi + salq $6, %rbx + shrq $32, %rdi + movdqa (%rax,%rbx), %xmm0 + movq %r8, %rbx + salq $4, %r10 + salq $6, %rbx + movdqa %xmm0, %xmm4 + pmaxsd %xmm1, %xmm0 + pminsd %xmm1, %xmm4 + pmaxsd (%rax,%rbx), %xmm4 + movq %rcx, %rbx + salq $6, %rbx + movdqa (%rax,%rbx), %xmm1 + movq %rdx, %rbx + pminsd %xmm0, %xmm4 + salq $6, %rbx + movaps %xmm4, (%r12) + movdqa (%rax,%rbx), %xmm0 + movq %r11, %rbx + salq $6, %rbx + movdqa %xmm0, %xmm12 + pmaxsd %xmm1, %xmm0 + pminsd %xmm1, %xmm12 + pmaxsd (%rax,%rbx), %xmm12 + movq %rdi, %rbx + salq $6, %rbx + movdqa (%rax,%rbx), %xmm1 + movq %r9, %rbx + pminsd %xmm0, %xmm12 + salq $6, %rbx + movaps %xmm12, 64(%r12) + movdqa (%rax,%rbx), %xmm0 + movq %rsi, %rbx + salq $6, %rbx + addq $4, %r10 + salq $4, %r15 + movdqa %xmm0, %xmm8 + 
addq $4, %r15 + pmaxsd %xmm1, %xmm0 + salq $4, %r8 + pminsd %xmm1, %xmm8 + movdqa (%rax,%r10,4), %xmm1 + salq $4, %rdx + salq $4, %rcx + pmaxsd (%rax,%rbx), %xmm8 + addq $4, %rdx + addq $4, %rcx + salq $4, %r11 + movdqa (%rax,%r15,4), %xmm2 + salq $4, %r9 + salq $4, %rdi + leaq 0(,%r15,4), %r14 + pminsd %xmm0, %xmm8 + movdqa %xmm1, %xmm0 + addq $4, %r9 + addq $4, %rdi + pminsd %xmm2, %xmm0 + pmaxsd 16(%rax,%r8,4), %xmm0 + salq $4, %rsi + movaps %xmm8, 128(%r12) + pmaxsd %xmm2, %xmm1 + movdqa (%rax,%rcx,4), %xmm2 + leaq 0(,%r10,4), %rbx + pminsd %xmm1, %xmm0 + movdqa (%rax,%rdx,4), %xmm1 + movdqa 16(%rax,%rbx), %xmm14 + leaq 0(,%rdx,4), %r10 + pminsd 16(%rax,%r14), %xmm14 + movaps %xmm0, 16(%r12) + leaq 0(,%rcx,4), %r15 + movdqa %xmm1, %xmm11 + pmaxsd %xmm2, %xmm1 + leaq 0(,%r9,4), %rdx + pminsd %xmm2, %xmm11 + pmaxsd 16(%rax,%r11,4), %xmm11 + movdqa (%rax,%rdi,4), %xmm2 + leaq 0(,%rdi,4), %rcx + pminsd %xmm1, %xmm11 + movdqa (%rax,%r9,4), %xmm1 + movaps %xmm11, 80(%r12) + movdqa %xmm1, %xmm7 + pmaxsd %xmm2, %xmm1 + pminsd %xmm2, %xmm7 + pmaxsd 16(%rax,%rsi,4), %xmm7 + pminsd %xmm1, %xmm7 + movdqa 16(%rax,%rbx), %xmm1 + movaps %xmm7, 144(%r12) + pmaxsd 16(%rax,%r14), %xmm1 + movdqa 16(%rax,%r10), %xmm10 + pmaxsd 32(%rax,%r8,4), %xmm14 + pminsd 16(%rax,%r15), %xmm10 + pmaxsd 32(%rax,%r11,4), %xmm10 + movdqa 16(%rax,%rdx), %xmm6 + pminsd %xmm1, %xmm14 + movdqa 16(%rax,%r10), %xmm1 + pmaxsd 16(%rax,%r15), %xmm1 + pminsd 16(%rax,%rcx), %xmm6 + pmaxsd 32(%rax,%rsi,4), %xmm6 + movaps %xmm14, 32(%r12) + pminsd %xmm1, %xmm10 + movdqa 16(%rax,%rdx), %xmm1 + pmaxsd 16(%rax,%rcx), %xmm1 + movdqa 32(%rax,%r14), %xmm2 + movaps %xmm10, 96(%r12) + leaq 192(%r12), %r14 + pminsd %xmm1, %xmm6 + movdqa 32(%rax,%rbx), %xmm1 + movaps %xmm6, 160(%r12) + movdqa %xmm1, %xmm13 + pmaxsd %xmm2, %xmm1 + pminsd %xmm2, %xmm13 + pmaxsd 48(%rax,%r8,4), %xmm13 + movdqa 32(%rax,%r15), %xmm2 + pminsd %xmm1, %xmm13 + movdqa 32(%rax,%r10), %xmm1 + movaps %xmm13, 48(%r12) + movdqa %xmm1, %xmm9 + pmaxsd %xmm2, %xmm1 + pminsd %xmm2, %xmm9 + pmaxsd 48(%rax,%r11,4), %xmm9 + movdqa 32(%rax,%rcx), %xmm2 + pminsd %xmm1, %xmm9 + movdqa 32(%rax,%rdx), %xmm1 + movaps %xmm9, 112(%r12) + movdqa %xmm1, %xmm5 + pmaxsd %xmm2, %xmm1 + pminsd %xmm2, %xmm5 + pmaxsd 48(%rax,%rsi,4), %xmm5 + pshufd $0, %xmm4, %xmm2 + pxor %xmm2, %xmm4 + pxor %xmm2, %xmm14 + pxor %xmm2, %xmm13 + pminsd %xmm1, %xmm5 + movdqa %xmm0, %xmm1 + pxor %xmm2, %xmm12 + pxor %xmm2, %xmm1 + pxor %xmm2, %xmm11 + pxor %xmm2, %xmm10 + movaps %xmm5, 176(%r12) + por %xmm4, %xmm1 + pxor %xmm2, %xmm9 + pxor %xmm2, %xmm8 + movdqa 192(%r12), %xmm4 + por %xmm14, %xmm1 + pxor %xmm2, %xmm7 + pxor %xmm2, %xmm6 + por %xmm13, %xmm1 + pxor %xmm2, %xmm5 + pxor %xmm2, %xmm4 + por %xmm12, %xmm1 + pxor %xmm0, %xmm0 + movdqa %xmm2, %xmm3 + por %xmm11, %xmm1 + por %xmm10, %xmm1 + por %xmm9, %xmm1 + por %xmm8, %xmm1 + por %xmm7, %xmm1 + por %xmm6, %xmm1 + por %xmm5, %xmm1 + por %xmm1, %xmm4 + pblendvb %xmm0, %xmm4, %xmm1 + pxor %xmm0, %xmm0 + pcmpeqd %xmm0, %xmm1 + movmskps %xmm1, %eax + cmpl $15, %eax + je .L1174 + movdqa .LC4(%rip), %xmm0 + movl $4, %esi + movq %r12, %rdi + movups %xmm0, 192(%r12) + movups %xmm0, 208(%r12) + movups %xmm0, 224(%r12) + movups %xmm0, 240(%r12) + movups %xmm0, 256(%r12) + call _ZN3hwy6N_SSE46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + movd (%r12), %xmm6 + pcmpeqd %xmm2, %xmm2 + movd 188(%r12), %xmm5 + pshufd $0, %xmm5, %xmm1 + pshufd $0, %xmm6, %xmm0 + paddd %xmm1, %xmm2 + pcmpeqd %xmm0, %xmm2 + movmskps 
%xmm2, %eax + cmpl $15, %eax + jne .L1176 + movq -72(%rbp), %rsi + leaq -64(%rbp), %rdx + movq %r14, %rcx + movq %r13, %rdi + call _ZN3hwy6N_SSE46detail22MaybePartitionTwoValueINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, %al + jne .L1159 +.L1176: + movl 96(%r12), %ecx + movl $23, %eax + movl $24, %edx + cmpl %ecx, 92(%r12) + je .L1218 + jmp .L1223 + .p2align 4,,10 + .p2align 3 +.L1221: + testq %rax, %rax + je .L1451 +.L1218: + movq %rax, %rdx + subq $1, %rax + movl (%r12,%rax,4), %esi + cmpl %esi, %ecx + je .L1221 + cmpl (%r12,%rdx,4), %ecx + je .L1223 + movl %esi, %ecx + jmp .L1220 + .p2align 4,,10 + .p2align 3 +.L1224: + cmpq $47, %rdx + je .L1442 +.L1223: + movq %rdx, %rsi + addq $1, %rdx + cmpl (%r12,%rdx,4), %ecx + je .L1224 + movl $24, %edx + subq $23, %rsi + subq %rax, %rdx + cmpq %rdx, %rsi + jb .L1220 +.L1442: + movl (%r12,%rax,4), %ecx +.L1220: + movd %ecx, %xmm6 + pshufd $0, %xmm6, %xmm1 +.L1443: + movl $1, -112(%rbp) +.L1217: + cmpq $0, -88(%rbp) + je .L1452 + movq -72(%rbp), %rax + movdqa %xmm1, %xmm4 + leaq -4(%rax), %r10 + movq %r10, %rdx + movq %r10, %rcx + movdqu 0(%r13,%r10,4), %xmm6 + andl $15, %edx + andl $12, %ecx + je .L1290 + movdqu 0(%r13), %xmm2 + pcmpeqd %xmm0, %xmm0 + xorl %edi, %edi + movdqa .LC0(%rip), %xmm5 + leaq _ZZN3hwy6N_SSE46detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices(%rip), %r9 + movdqa %xmm2, %xmm3 + pcmpgtd %xmm1, %xmm3 + movdqa %xmm2, %xmm1 + pxor %xmm3, %xmm0 + movmskps %xmm0, %eax + popcntq %rax, %rdi + movq %rdi, %xmm0 + salq $4, %rax + movq %rdi, %rcx + pshufd $0, %xmm0, %xmm0 + pshufb (%r9,%rax), %xmm1 + pcmpgtd %xmm5, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L1228 + movd %xmm1, 0(%r13) +.L1228: + pextrd $1, %xmm0, %eax + testl %eax, %eax + je .L1229 + pextrd $1, %xmm1, 4(%r13) +.L1229: + pextrd $2, %xmm0, %eax + testl %eax, %eax + je .L1230 + pextrd $2, %xmm1, 8(%r13) +.L1230: + pextrd $3, %xmm0, %eax + testl %eax, %eax + je .L1231 + pextrd $3, %xmm1, 12(%r13) +.L1231: + leaq 0(%r13,%rcx,4), %rax + xorl %esi, %esi + movmskps %xmm3, %ecx + popcntq %rcx, %rsi + salq $4, %rcx + pshufb (%r9,%rcx), %xmm2 + movups %xmm2, (%r12) + testb $8, %r10b + je .L1232 + movdqu 16(%r13), %xmm1 + pcmpeqd %xmm0, %xmm0 + xorl %edi, %edi + movdqa %xmm1, %xmm2 + movdqa %xmm1, %xmm3 + pcmpgtd %xmm4, %xmm2 + pxor %xmm2, %xmm0 + movmskps %xmm0, %ecx + popcntq %rcx, %rdi + movq %rdi, %xmm0 + salq $4, %rcx + pshufd $0, %xmm0, %xmm0 + pshufb (%r9,%rcx), %xmm3 + pcmpgtd %xmm5, %xmm0 + movd %xmm0, %ecx + testl %ecx, %ecx + je .L1233 + movd %xmm3, (%rax) +.L1233: + pextrd $1, %xmm0, %ecx + testl %ecx, %ecx + je .L1234 + pextrd $1, %xmm3, 4(%rax) +.L1234: + pextrd $2, %xmm0, %ecx + testl %ecx, %ecx + je .L1235 + pextrd $2, %xmm3, 8(%rax) +.L1235: + pextrd $3, %xmm0, %ecx + testl %ecx, %ecx + je .L1236 + pextrd $3, %xmm3, 12(%rax) +.L1236: + movmskps %xmm2, %ecx + leaq (%rax,%rdi,4), %rax + movq %rcx, %rdi + popcntq %rcx, %rcx + salq $4, %rdi + pshufb (%r9,%rdi), %xmm1 + movups %xmm1, (%r12,%rsi,4) + addq %rcx, %rsi + cmpq $11, %rdx + jbe .L1232 + movdqu 32(%r13), %xmm1 + pcmpeqd %xmm0, %xmm0 + xorl %edi, %edi + movdqa %xmm1, %xmm2 + movdqa %xmm1, %xmm3 + pcmpgtd %xmm4, %xmm2 + pxor %xmm2, %xmm0 + movmskps %xmm0, %ecx + popcntq %rcx, %rdi + movq %rdi, %xmm0 + salq $4, %rcx + pshufd $0, %xmm0, %xmm0 + pshufb (%r9,%rcx), %xmm3 + pcmpgtd %xmm5, %xmm0 + movd %xmm0, %ecx + testl %ecx, %ecx + je .L1237 + movd %xmm3, (%rax) 
+.L1237: + pextrd $1, %xmm0, %ecx + testl %ecx, %ecx + je .L1238 + pextrd $1, %xmm3, 4(%rax) +.L1238: + pextrd $2, %xmm0, %ecx + testl %ecx, %ecx + je .L1239 + pextrd $2, %xmm3, 8(%rax) +.L1239: + pextrd $3, %xmm0, %ecx + testl %ecx, %ecx + je .L1240 + pextrd $3, %xmm3, 12(%rax) +.L1240: + movmskps %xmm2, %ecx + leaq (%rax,%rdi,4), %rax + movq %rcx, %rdi + popcntq %rcx, %rcx + salq $4, %rdi + pshufb (%r9,%rdi), %xmm1 + movups %xmm1, (%r12,%rsi,4) + addq %rcx, %rsi +.L1232: + leaq -4(%rdx), %rcx + leaq 1(%rdx), %rdi + andq $-4, %rcx + leaq 0(,%rsi,4), %r8 + addq $4, %rcx + cmpq $4, %rdi + movl $4, %edi + cmovbe %rdi, %rcx +.L1227: + cmpq %rdx, %rcx + je .L1241 + movdqu 0(%r13,%rcx,4), %xmm2 + subq %rcx, %rdx + xorl %edi, %edi + movd %edx, %xmm7 + movdqa %xmm2, %xmm3 + pshufd $0, %xmm7, %xmm0 + movdqa %xmm2, %xmm7 + pcmpgtd %xmm4, %xmm3 + pcmpgtd %xmm5, %xmm0 + movdqa %xmm3, %xmm1 + pandn %xmm0, %xmm1 + movmskps %xmm1, %edx + popcntq %rdx, %rdi + movq %rdi, %xmm1 + salq $4, %rdx + movq %rdi, %rcx + pshufd $0, %xmm1, %xmm1 + pshufb (%r9,%rdx), %xmm7 + pcmpgtd %xmm5, %xmm1 + movd %xmm1, %edx + testl %edx, %edx + je .L1242 + movd %xmm7, (%rax) +.L1242: + pextrd $1, %xmm1, %edx + testl %edx, %edx + je .L1243 + pextrd $1, %xmm7, 4(%rax) +.L1243: + pextrd $2, %xmm1, %edx + testl %edx, %edx + je .L1244 + pextrd $2, %xmm7, 8(%rax) +.L1244: + pextrd $3, %xmm1, %edx + testl %edx, %edx + je .L1245 + pextrd $3, %xmm7, 12(%rax) +.L1245: + pand %xmm0, %xmm3 + leaq (%rax,%rcx,4), %rax + movmskps %xmm3, %edx + movq %rdx, %rcx + popcntq %rdx, %rdx + addq %rdx, %rsi + salq $4, %rcx + pshufb (%r9,%rcx), %xmm2 + movups %xmm2, (%r12,%r8) + leaq 0(,%rsi,4), %r8 +.L1241: + movq %r10, %rdx + movl %r8d, %ecx + subq %rsi, %rdx + leaq 0(%r13,%rdx,4), %r11 + cmpl $8, %r8d + jnb .L1246 + testb $4, %r8b + jne .L1453 + testl %ecx, %ecx + jne .L1454 +.L1247: + movl %r8d, %ecx + cmpl $8, %r8d + jnb .L1250 + andl $4, %r8d + jne .L1455 + testl %ecx, %ecx + jne .L1456 +.L1251: + movq %rax, %rcx + movq %r10, %r14 + subq %r13, %rcx + sarq $2, %rcx + subq %rcx, %r14 + subq %rcx, %rdx + movq %rcx, %r15 + leaq (%rax,%rdx,4), %rcx + je .L1291 + leaq 64(%rax), %rsi + leaq -64(%rcx), %r10 + movdqu (%rax), %xmm14 + movdqu 16(%rax), %xmm13 + movdqu 32(%rax), %xmm12 + movdqu 48(%rax), %xmm11 + movdqu -64(%rcx), %xmm10 + movdqu -48(%rcx), %xmm9 + movdqu -32(%rcx), %xmm8 + movdqu -16(%rcx), %xmm7 + cmpq %r10, %rsi + je .L1292 + xorl %ecx, %ecx + leaq _ZZN3hwy6N_SSE46detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices(%rip), %rdi + movl $4, %r11d + jmp .L1258 + .p2align 4,,10 + .p2align 3 +.L1458: + movdqu -64(%r10), %xmm3 + movdqu -48(%r10), %xmm2 + prefetcht0 -256(%r10) + subq $64, %r10 + movdqu 32(%r10), %xmm1 + movdqu 48(%r10), %xmm0 +.L1257: + movdqa %xmm3, %xmm15 + pcmpgtd %xmm4, %xmm15 + movmskps %xmm15, %r8d + movq %r8, %rbx + popcntq %r8, %r8 + salq $4, %rbx + pshufb (%rdi,%rbx), %xmm3 + leaq -4(%rcx,%rdx), %rbx + movups %xmm3, (%rax,%rcx,4) + addq $4, %rcx + movups %xmm3, (%rax,%rbx,4) + movdqa %xmm2, %xmm3 + subq %r8, %rcx + pcmpgtd %xmm4, %xmm3 + movmskps %xmm3, %r8d + movq %r8, %rbx + popcntq %r8, %r8 + salq $4, %rbx + pshufb (%rdi,%rbx), %xmm2 + leaq -8(%rcx,%rdx), %rbx + movups %xmm2, (%rax,%rcx,4) + movups %xmm2, (%rax,%rbx,4) + movdqa %xmm1, %xmm2 + movq %r11, %rbx + pcmpgtd %xmm4, %xmm2 + subq %r8, %rbx + addq %rbx, %rcx + movmskps %xmm2, %r8d + movq %r8, %rbx + popcntq %r8, %r8 + salq $4, %rbx + pshufb (%rdi,%rbx), %xmm1 + leaq -12(%rcx,%rdx), %rbx + subq $16, %rdx + movups %xmm1, 
(%rax,%rcx,4) + movups %xmm1, (%rax,%rbx,4) + movdqa %xmm0, %xmm1 + movq %r11, %rbx + pcmpgtd %xmm4, %xmm1 + subq %r8, %rbx + leaq (%rbx,%rcx), %r8 + movmskps %xmm1, %ecx + movq %rcx, %rbx + popcntq %rcx, %rcx + salq $4, %rbx + pshufb (%rdi,%rbx), %xmm0 + leaq (%r8,%rdx), %rbx + movups %xmm0, (%rax,%r8,4) + movups %xmm0, (%rax,%rbx,4) + movq %r11, %rbx + subq %rcx, %rbx + leaq (%rbx,%r8), %rcx + cmpq %r10, %rsi + je .L1457 +.L1258: + movq %rsi, %r8 + subq %rax, %r8 + sarq $2, %r8 + subq %rcx, %r8 + cmpq $16, %r8 + ja .L1458 + movdqu (%rsi), %xmm3 + movdqu 16(%rsi), %xmm2 + prefetcht0 256(%rsi) + addq $64, %rsi + movdqu -32(%rsi), %xmm1 + movdqu -16(%rsi), %xmm0 + jmp .L1257 + .p2align 4,,10 + .p2align 3 +.L1450: + movl $16, %eax + movq %r8, %r15 + subq %rdx, %rax + leaq (%rdi,%rax,4), %rax + movq -72(%rbp), %rdi + leaq -16(%rdx,%rdi), %r14 + jmp .L1172 + .p2align 4,,10 + .p2align 3 +.L1457: + leaq (%rax,%rcx,4), %r10 + leaq (%rdx,%rcx), %r8 + addq $4, %rcx +.L1255: + movdqa %xmm14, %xmm0 + pcmpgtd %xmm4, %xmm0 + movmskps %xmm0, %esi + movdqa %xmm13, %xmm0 + pcmpgtd %xmm4, %xmm0 + movq %rsi, %r11 + popcntq %rsi, %rsi + subq %rsi, %rcx + salq $4, %r11 + pshufb (%rdi,%r11), %xmm14 + movmskps %xmm0, %esi + movdqa %xmm12, %xmm0 + movups %xmm14, (%r10) + pcmpgtd %xmm4, %xmm0 + movups %xmm14, -16(%rax,%r8,4) + movq %rsi, %r8 + popcntq %rsi, %rsi + salq $4, %r8 + pshufb (%rdi,%r8), %xmm13 + leaq -8(%rdx,%rcx), %r8 + movups %xmm13, (%rax,%rcx,4) + subq %rsi, %rcx + movups %xmm13, (%rax,%r8,4) + movmskps %xmm0, %r8d + addq $4, %rcx + movdqa %xmm11, %xmm0 + movq %r8, %rsi + pcmpgtd %xmm4, %xmm0 + popcntq %r8, %r8 + salq $4, %rsi + pshufb (%rdi,%rsi), %xmm12 + leaq -12(%rdx,%rcx), %rsi + movups %xmm12, (%rax,%rcx,4) + movups %xmm12, (%rax,%rsi,4) + movl $4, %esi + movq %rsi, %r10 + subq %r8, %r10 + movmskps %xmm0, %r8d + movdqa %xmm10, %xmm0 + addq %r10, %rcx + pcmpgtd %xmm4, %xmm0 + movq %r8, %r10 + popcntq %r8, %r8 + salq $4, %r10 + pshufb (%rdi,%r10), %xmm11 + leaq -16(%rdx,%rcx), %r10 + movups %xmm11, (%rax,%rcx,4) + movups %xmm11, (%rax,%r10,4) + movq %rsi, %r10 + subq %r8, %r10 + leaq (%r10,%rcx), %r8 + movmskps %xmm0, %ecx + movdqa %xmm9, %xmm0 + movq %rcx, %r10 + pcmpgtd %xmm4, %xmm0 + popcntq %rcx, %rcx + salq $4, %r10 + pshufb (%rdi,%r10), %xmm10 + leaq -20(%rdx,%r8), %r10 + movups %xmm10, (%rax,%r8,4) + movups %xmm10, (%rax,%r10,4) + movq %rsi, %r10 + subq %rcx, %r10 + leaq (%r10,%r8), %rcx + movmskps %xmm0, %r8d + movdqa %xmm8, %xmm0 + movq %r8, %r10 + pcmpgtd %xmm4, %xmm0 + popcntq %r8, %r8 + salq $4, %r10 + pshufb (%rdi,%r10), %xmm9 + leaq -24(%rdx,%rcx), %r10 + movups %xmm9, (%rax,%rcx,4) + movups %xmm9, (%rax,%r10,4) + movq %rsi, %r10 + subq %r8, %r10 + leaq (%r10,%rcx), %r8 + movmskps %xmm0, %ecx + movdqa %xmm7, %xmm0 + movq %rcx, %r10 + pcmpgtd %xmm4, %xmm0 + salq $4, %r10 + pshufb (%rdi,%r10), %xmm8 + leaq -28(%rdx,%r8), %r10 + movups %xmm8, (%rax,%r8,4) + movups %xmm8, (%rax,%r10,4) + xorl %r10d, %r10d + popcntq %rcx, %r10 + movq %rsi, %rcx + subq %r10, %rcx + addq %r8, %rcx + movmskps %xmm0, %r8d + movq %r8, %r10 + leaq -32(%rdx,%rcx), %rdx + popcntq %r8, %r8 + subq %r8, %rsi + salq $4, %r10 + pshufb (%rdi,%r10), %xmm7 + movq %r14, %rdi + movups %xmm7, (%rax,%rcx,4) + movups %xmm7, (%rax,%rdx,4) + leaq (%rsi,%rcx), %rdx + subq %rdx, %rdi + leaq 0(,%rdx,4), %rcx +.L1254: + movdqa %xmm6, %xmm1 + pcmpeqd %xmm0, %xmm0 + movdqa %xmm6, %xmm2 + cmpq $4, %rdi + pcmpgtd %xmm4, %xmm1 + leaq -16(,%r14,4), %rsi + cmovnb %rcx, %rsi + xorl %edi, %edi + pxor %xmm1, %xmm0 + movdqu (%rax,%rsi), 
%xmm7 + movmskps %xmm0, %esi + popcntq %rsi, %rdi + movq %rdi, %xmm0 + salq $4, %rsi + movups %xmm7, (%rax,%r14,4) + pshufd $0, %xmm0, %xmm0 + pshufb (%r9,%rsi), %xmm2 + pcmpgtd %xmm5, %xmm0 + movd %xmm0, %esi + testl %esi, %esi + je .L1260 + movd %xmm2, (%rax,%rcx) +.L1260: + pextrd $1, %xmm0, %esi + testl %esi, %esi + je .L1261 + pextrd $1, %xmm2, 4(%rax,%rcx) +.L1261: + pextrd $2, %xmm0, %esi + testl %esi, %esi + je .L1262 + pextrd $2, %xmm2, 8(%rax,%rcx) +.L1262: + pextrd $3, %xmm0, %esi + testl %esi, %esi + je .L1263 + pextrd $3, %xmm2, 12(%rax,%rcx) +.L1263: + addq %rdx, %rdi + xorl %esi, %esi + movmskps %xmm1, %edx + popcntq %rdx, %rsi + movq %rsi, %xmm0 + salq $4, %rdx + leaq 0(,%rdi,4), %rcx + pshufd $0, %xmm0, %xmm0 + pshufb (%r9,%rdx), %xmm6 + pcmpgtd %xmm5, %xmm0 + movd %xmm0, %edx + testl %edx, %edx + je .L1264 + movd %xmm6, (%rax,%rdi,4) +.L1264: + pextrd $1, %xmm0, %edx + testl %edx, %edx + je .L1265 + pextrd $1, %xmm6, 4(%rax,%rcx) +.L1265: + pextrd $2, %xmm0, %edx + testl %edx, %edx + je .L1266 + pextrd $2, %xmm6, 8(%rax,%rcx) +.L1266: + pextrd $3, %xmm0, %edx + testl %edx, %edx + je .L1267 + pextrd $3, %xmm6, 12(%rax,%rcx) +.L1267: + movq -88(%rbp), %r14 + addq %rdi, %r15 + subq $1, %r14 + cmpl $2, -112(%rbp) + je .L1269 + movq -80(%rbp), %r8 + movq -96(%rbp), %rsi + movq %r14, %r9 + movq %r12, %rcx + movq %r15, %rdx + movq %r13, %rdi + call _ZN3hwy6N_SSE46detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + cmpl $3, -112(%rbp) + je .L1159 +.L1269: + movq -72(%rbp), %rdx + movq -80(%rbp), %r8 + leaq 0(%r13,%r15,4), %rdi + movq %r14, %r9 + movq -96(%rbp), %rsi + movq %r12, %rcx + subq %r15, %rdx + call _ZN3hwy6N_SSE46detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L1159: + addq $88, %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L1250: + .cfi_restore_state + movq (%r12), %rcx + leaq 8(%r11), %rdi + andq $-8, %rdi + movq %rcx, (%r11) + movl %r8d, %ecx + movq -8(%r12,%rcx), %rsi + movq %rsi, -8(%r11,%rcx) + subq %rdi, %r11 + movq %r12, %rsi + leal (%r8,%r11), %ecx + subq %r11, %rsi + shrl $3, %ecx + rep movsq + jmp .L1251 + .p2align 4,,10 + .p2align 3 +.L1246: + movq (%r11), %rcx + leaq 8(%rax), %rdi + andq $-8, %rdi + movq %rcx, (%rax) + movl %r8d, %ecx + movq -8(%r11,%rcx), %rsi + movq %rsi, -8(%rax,%rcx) + movq %rax, %rcx + movq %r11, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %r8d, %ecx + shrl $3, %ecx + rep movsq + jmp .L1247 +.L1451: + movl (%r12), %ecx + jmp .L1220 +.L1456: + movzbl (%r12), %esi + movb %sil, (%r11) + testb $2, %cl + je .L1251 + movzwl -2(%r12,%rcx), %esi + movw %si, -2(%r11,%rcx) + jmp .L1251 +.L1454: + movzbl (%r11), %esi + movb %sil, (%rax) + testb $2, %cl + je .L1247 + movzwl -2(%r11,%rcx), %esi + movw %si, -2(%rax,%rcx) + jmp .L1247 +.L1449: + cmpq $1, %rdx + jbe .L1159 + leaq 256(%rdi), %rax + cmpq %rax, %rsi + jb .L1459 + movl $4, %esi + call _ZN3hwy6N_SSE46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + jmp .L1159 +.L1174: + movq -112(%rbp), %rax + movl $4, %edi + movdqu 0(%r13), %xmm0 + andl $3, %eax + pcmpeqd %xmm2, %xmm0 + subq %rax, %rdi + movd %edi, %xmm5 + pshufd $0, %xmm5, %xmm1 + movdqa .LC0(%rip), %xmm5 + pcmpgtd %xmm5, %xmm1 + pandn %xmm1, %xmm0 + movmskps %xmm0, %eax + testl %eax, %eax + jne .L1460 + 
pxor %xmm1, %xmm1 + movq -72(%rbp), %r8 + leaq 256(%r13,%rdi,4), %rsi + pxor %xmm6, %xmm6 + movdqa %xmm1, %xmm0 + .p2align 4,,10 + .p2align 3 +.L1180: + movq %rdi, %rcx + leaq 64(%rdi), %rdi + cmpq %rdi, %r8 + jb .L1461 + leaq -256(%rsi), %rax +.L1179: + movdqa (%rax), %xmm4 + leaq 32(%rax), %rdx + pxor %xmm3, %xmm4 + por %xmm4, %xmm0 + movdqa 16(%rax), %xmm4 + pxor %xmm3, %xmm4 + por %xmm4, %xmm1 + movdqa 32(%rax), %xmm4 + pxor %xmm3, %xmm4 + por %xmm4, %xmm0 + movdqa 48(%rax), %xmm4 + pxor %xmm3, %xmm4 + por %xmm4, %xmm1 + movdqa 64(%rax), %xmm4 + pxor %xmm3, %xmm4 + por %xmm4, %xmm0 + movdqa 80(%rax), %xmm4 + leaq 96(%rdx), %rax + pxor %xmm3, %xmm4 + por %xmm4, %xmm1 + movdqa 64(%rdx), %xmm4 + pxor %xmm3, %xmm4 + por %xmm4, %xmm0 + movdqa 80(%rdx), %xmm4 + pxor %xmm3, %xmm4 + por %xmm4, %xmm1 + cmpq %rsi, %rax + jne .L1179 + movdqa %xmm0, %xmm4 + leaq 352(%rdx), %rsi + por %xmm1, %xmm4 + pcmpeqd %xmm6, %xmm4 + movmskps %xmm4, %eax + cmpl $15, %eax + je .L1180 + movdqa %xmm2, %xmm0 + pcmpeqd %xmm1, %xmm1 + pcmpeqd 0(%r13,%rcx,4), %xmm0 + pxor %xmm1, %xmm0 + movmskps %xmm0, %eax + testl %eax, %eax + jne .L1182 + .p2align 4,,10 + .p2align 3 +.L1181: + addq $4, %rcx + movdqa %xmm2, %xmm0 + pcmpeqd 0(%r13,%rcx,4), %xmm0 + pxor %xmm1, %xmm0 + movmskps %xmm0, %eax + testl %eax, %eax + je .L1181 +.L1182: + rep bsfl %eax, %eax + cltq + addq %rcx, %rax +.L1178: + leaq 0(%r13,%rax,4), %rdi + movdqa %xmm2, %xmm1 + movl (%rdi), %ecx + movd %ecx, %xmm6 + pshufd $0, %xmm6, %xmm3 + movdqa %xmm3, %xmm0 + movdqa %xmm3, %xmm8 + pcmpgtd %xmm2, %xmm0 + movmskps %xmm0, %edx + testl %edx, %edx + jne .L1187 + movq -72(%rbp), %rdi + xorl %esi, %esi + leaq -4(%rdi), %rax + jmp .L1196 + .p2align 4,,10 + .p2align 3 +.L1188: + movmskps %xmm0, %edx + movups %xmm1, 0(%r13,%rax,4) + popcntq %rdx, %rdx + addq %rdx, %rsi + leaq -4(%rax), %rdx + cmpq %rdx, %rdi + jbe .L1462 + movq %rdx, %rax +.L1196: + movdqu 0(%r13,%rax,4), %xmm0 + movdqu 0(%r13,%rax,4), %xmm4 + pcmpeqd %xmm2, %xmm0 + pcmpeqd %xmm3, %xmm4 + movdqa %xmm0, %xmm6 + movdqa %xmm0, %xmm7 + por %xmm4, %xmm6 + movmskps %xmm6, %edx + cmpl $15, %edx + je .L1188 + pcmpeqd %xmm0, %xmm0 + leaq 4(%rax), %rdi + pxor %xmm0, %xmm7 + pandn %xmm7, %xmm4 + movmskps %xmm4, %edx + rep bsfl %edx, %edx + movslq %edx, %rdx + addq %rax, %rdx + addq $8, %rax + movd 0(%r13,%rdx,4), %xmm6 + movq -72(%rbp), %rdx + pshufd $0, %xmm6, %xmm0 + subq %rsi, %rdx + movdqa %xmm0, %xmm4 + movaps %xmm0, -64(%rbp) + cmpq %rax, %rdx + jb .L1189 +.L1190: + movups %xmm8, -16(%r13,%rax,4) + movq %rax, %rdi + addq $4, %rax + cmpq %rdx, %rax + jbe .L1190 +.L1189: + subq %rdi, %rdx + leaq 0(,%rdi,4), %rsi + movq %rdx, %xmm0 + pshufd $0, %xmm0, %xmm0 + pcmpgtd %xmm5, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L1200 + movl %ecx, 0(%r13,%rdi,4) +.L1200: + pextrd $1, %xmm0, %eax + testl %eax, %eax + je .L1201 + movl %ecx, 4(%r13,%rsi) +.L1201: + pextrd $2, %xmm0, %eax + testl %eax, %eax + je .L1202 + movl %ecx, 8(%r13,%rsi) +.L1202: + pextrd $3, %xmm0, %eax + testl %eax, %eax + je .L1195 + movl %ecx, 12(%r13,%rsi) +.L1195: + movdqa %xmm2, %xmm0 + pcmpeqd .LC5(%rip), %xmm0 + movmskps %xmm0, %eax + cmpl $15, %eax + je .L1287 + movdqa %xmm2, %xmm0 + pcmpeqd .LC6(%rip), %xmm0 + movmskps %xmm0, %eax + cmpl $15, %eax + je .L1213 + movdqa %xmm3, %xmm5 + movdqa %xmm2, %xmm0 + pminsd %xmm4, %xmm5 + pcmpgtd %xmm5, %xmm0 + movmskps %xmm0, %eax + testl %eax, %eax + jne .L1463 + movdqa %xmm1, %xmm3 + movl $64, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L1208: + leaq (%rcx,%rax,4), 
%rdx + addq $1, %rax + movdqu 0(%r13,%rdx,4), %xmm0 + pminsd %xmm3, %xmm0 + movdqa %xmm0, %xmm3 + cmpq $16, %rax + jne .L1208 + movdqa %xmm2, %xmm4 + pcmpgtd %xmm0, %xmm4 + movmskps %xmm4, %eax + testl %eax, %eax + jne .L1443 + leaq 64(%rsi), %rax + cmpq %rax, -72(%rbp) + jb .L1464 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L1208 +.L1290: + movdqa .LC0(%rip), %xmm5 + xorl %r8d, %r8d + xorl %esi, %esi + movq %r13, %rax + leaq _ZZN3hwy6N_SSE46detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices(%rip), %r9 + jmp .L1227 +.L1291: + xorl %ecx, %ecx + movq %r14, %rdi + jmp .L1254 +.L1292: + movq %rdx, %r8 + movq %rax, %r10 + movl $4, %ecx + leaq _ZZN3hwy6N_SSE46detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices(%rip), %rdi + jmp .L1255 +.L1452: + movq -72(%rbp), %rsi + leaq -1(%rsi), %rbx + movq %rbx, %r12 + shrq %r12 + .p2align 4,,10 + .p2align 3 +.L1225: + movq %r12, %rdx + movq %r13, %rdi + call _ZN3hwy6N_SSE46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0 + subq $1, %r12 + jnb .L1225 + .p2align 4,,10 + .p2align 3 +.L1226: + movl 0(%r13,%rbx,4), %edx + movl 0(%r13), %eax + movq %rbx, %rsi + movq %r13, %rdi + movl %edx, 0(%r13) + xorl %edx, %edx + movl %eax, 0(%r13,%rbx,4) + call _ZN3hwy6N_SSE46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0 + subq $1, %rbx + jne .L1226 + jmp .L1159 +.L1455: + movl (%r12), %esi + movl %esi, (%r11) + movl -4(%r12,%rcx), %esi + movl %esi, -4(%r11,%rcx) + jmp .L1251 +.L1453: + movl (%r11), %esi + movl %esi, (%rax) + movl -4(%r11,%rcx), %esi + movl %esi, -4(%rax,%rcx) + jmp .L1247 +.L1461: + movq -72(%rbp), %rsi + pcmpeqd %xmm1, %xmm1 +.L1184: + movq %rcx, %rdx + addq $4, %rcx + cmpq %rcx, %rsi + jb .L1465 + movdqa %xmm2, %xmm0 + pcmpeqd -16(%r13,%rcx,4), %xmm0 + pxor %xmm1, %xmm0 + movmskps %xmm0, %eax + testl %eax, %eax + je .L1184 +.L1440: + rep bsfl %eax, %eax + cltq + addq %rdx, %rax + jmp .L1178 +.L1187: + movq -72(%rbp), %rsi + leaq -64(%rbp), %rdx + movq %r12, %rcx + movdqa %xmm3, %xmm1 + movdqa %xmm2, %xmm0 + movaps %xmm3, -112(%rbp) + subq %rax, %rsi + call _ZN3hwy6N_SSE46detail22MaybePartitionTwoValueINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, %al + jne .L1159 + movd (%r12), %xmm5 + movdqa -64(%rbp), %xmm4 + movdqa -112(%rbp), %xmm3 + pshufd $0, %xmm5, %xmm2 + movdqa %xmm2, %xmm1 + jmp .L1195 +.L1462: + movd %eax, %xmm6 + movdqu 0(%r13), %xmm0 + movq -72(%rbp), %rdx + pshufd $0, %xmm6, %xmm4 + movdqu 0(%r13), %xmm6 + pcmpgtd %xmm5, %xmm4 + pcmpeqd %xmm2, %xmm0 + subq %rsi, %rdx + pcmpeqd %xmm3, %xmm6 + movdqa %xmm4, %xmm7 + pand %xmm0, %xmm7 + por %xmm6, %xmm0 + pcmpeqd %xmm6, %xmm6 + pxor %xmm6, %xmm4 + por %xmm4, %xmm0 + movmskps %xmm0, %edi + cmpl $15, %edi + jne .L1466 + movmskps %xmm7, %ecx + movq %rdx, %rax + movups %xmm1, 0(%r13) + popcntq %rcx, %rcx + subq %rcx, %rax + cmpq $3, %rax + jbe .L1276 + leaq -4(%rax), %rdx + movq -120(%rbp), %rsi + movq %rdx, %rcx + shrq $2, %rcx + salq $4, %rcx + leaq 16(%r13,%rcx), %rcx +.L1205: + movups %xmm8, (%rsi) + addq $16, %rsi + cmpq %rcx, %rsi + jne .L1205 + andq $-4, %rdx + addq $4, %rdx + leaq 0(,%rdx,4), %rcx + subq %rdx, %rax +.L1204: + movaps %xmm3, (%r12) + testq %rax, %rax + je .L1159 + leaq 0(%r13,%rcx), %rdi + leaq 0(,%rax,4), %rdx + movq %r12, %rsi + call memcpy@PLT + jmp .L1159 +.L1464: + movq 
-72(%rbp), %rdx + jmp .L1215 +.L1216: + movdqu -16(%r13,%rsi,4), %xmm5 + movdqa %xmm2, %xmm0 + pcmpgtd %xmm5, %xmm0 + movmskps %xmm0, %eax + testl %eax, %eax + jne .L1443 +.L1215: + movq %rsi, %rax + addq $4, %rsi + cmpq %rsi, %rdx + jnb .L1216 + movq -72(%rbp), %rdi + cmpq %rax, %rdi + je .L1287 + movdqu -16(%r13,%rdi,4), %xmm5 + movaps %xmm5, -112(%rbp) + pcmpgtd -112(%rbp), %xmm2 + movmskps %xmm2, %eax + cmpl $1, %eax + movl $1, %eax + adcl $0, %eax + movl %eax, -112(%rbp) + jmp .L1217 +.L1465: + movq -72(%rbp), %rax + pcmpeqd %xmm1, %xmm1 + leaq -4(%rax), %rdx + movdqu 0(%r13,%rdx,4), %xmm0 + pcmpeqd %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movmskps %xmm0, %eax + testl %eax, %eax + je .L1159 + jmp .L1440 + .p2align 4,,10 + .p2align 3 +.L1212: + movdqu -16(%r13,%rsi,4), %xmm0 + pcmpgtd %xmm2, %xmm0 + movmskps %xmm0, %eax + testl %eax, %eax + jne .L1443 +.L1211: + movq %rsi, %rax + addq $4, %rsi + cmpq %rsi, -72(%rbp) + jnb .L1212 + movq -72(%rbp), %rdi + cmpq %rax, %rdi + je .L1213 + movdqu -16(%r13,%rdi,4), %xmm5 + movaps %xmm5, -112(%rbp) + movdqa -112(%rbp), %xmm0 + pcmpgtd %xmm2, %xmm0 + movmskps %xmm0, %eax + testl %eax, %eax + jne .L1443 +.L1213: + pcmpeqd %xmm1, %xmm1 + movl $3, -112(%rbp) + paddd %xmm2, %xmm1 + jmp .L1217 +.L1287: + movl $2, -112(%rbp) + jmp .L1217 +.L1460: + rep bsfl %eax, %eax + cltq + jmp .L1178 +.L1459: + movq %rdx, %rcx + xorl %eax, %eax + cmpq $3, %rdx + jbe .L1165 + movq %rdx, %rbx + leaq -4(%rdx), %rdx + movq (%rdi), %rcx + movq %rdx, %rax + shrq $2, %rax + movq %rcx, (%r12) + addq $1, %rax + salq $4, %rax + movl %eax, %ecx + movq -8(%rdi,%rcx), %rsi + leaq 8(%r12), %rdi + andq $-8, %rdi + movq %rsi, -8(%r12,%rcx) + movq %r12, %rcx + movq %r13, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %eax, %ecx + movq %rdx, %rax + andq $-4, %rax + shrl $3, %ecx + rep movsq + addq $4, %rax + movq %rbx, %rcx + subq %rax, %rcx + je .L1168 +.L1165: + salq $2, %rax + leaq 0(,%rcx,4), %rdx + testq %rcx, %rcx + movl $4, %ecx + cmove %rcx, %rdx + leaq (%r12,%rax), %rdi + leaq 0(%r13,%rax), %rsi + call memcpy@PLT +.L1168: + movq -72(%rbp), %rdi + movl $32, %ecx + movl %edi, %eax + subl $1, %eax + bsrl %eax, %eax + xorl $31, %eax + subl %eax, %ecx + movl $1, %eax + salq %cl, %rax + shrq $4, %rax + movq %rax, %rsi + movl $1, %eax + cmove %rax, %rsi + movq %rsi, %rdx + salq $4, %rdx + addq $4, %rdx + cmpq %rdx, %rdi + jnb .L1167 + movdqa .LC4(%rip), %xmm0 + movq %rdi, %rax +.L1166: + movups %xmm0, (%r12,%rax,4) + addq $4, %rax + cmpq %rdx, %rax + jb .L1166 +.L1167: + movq %r12, %rdi + call _ZN3hwy6N_SSE46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + xorl %eax, %eax + cmpq $3, -72(%rbp) + jbe .L1170 + movq -72(%rbp), %rbx + movq (%r12), %rcx + leaq 8(%r13), %rdi + andq $-8, %rdi + leaq -4(%rbx), %rdx + movq %rcx, 0(%r13) + movq %rdx, %rax + shrq $2, %rax + addq $1, %rax + salq $4, %rax + movl %eax, %ecx + movq -8(%r12,%rcx), %rsi + movq %rsi, -8(%r13,%rcx) + movq %r13, %rcx + movq %r12, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %eax, %ecx + movq %rdx, %rax + andq $-4, %rax + shrl $3, %ecx + rep movsq + addq $4, %rax + subq %rax, %rbx + movq %rbx, -72(%rbp) + je .L1159 +.L1170: + movq -72(%rbp), %rsi + salq $2, %rax + movl $4, %ecx + leaq 0(%r13,%rax), %rdi + testq %rsi, %rsi + leaq 0(,%rsi,4), %rdx + leaq (%r12,%rax), %rsi + cmove %rcx, %rdx + call memcpy@PLT + jmp .L1159 +.L1463: + pmaxsd %xmm4, %xmm3 + movdqa %xmm3, %xmm0 + pcmpgtd %xmm2, %xmm0 + movmskps %xmm0, %eax + testl %eax, %eax + jne .L1443 + movdqa 
%xmm1, %xmm3 + movl $64, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L1209: + leaq (%rcx,%rax,4), %rdx + addq $1, %rax + movdqu 0(%r13,%rdx,4), %xmm0 + pmaxsd %xmm3, %xmm0 + movdqa %xmm0, %xmm3 + cmpq $16, %rax + jne .L1209 + pcmpgtd %xmm2, %xmm0 + movmskps %xmm0, %eax + testl %eax, %eax + jne .L1443 + leaq 64(%rsi), %rax + cmpq %rax, -72(%rbp) + jb .L1211 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L1209 +.L1466: + pxor %xmm6, %xmm0 + movmskps %xmm0, %esi + rep bsfl %esi, %esi + movslq %esi, %rsi + movd 0(%r13,%rsi,4), %xmm6 + leaq 4(%rax), %rsi + pshufd $0, %xmm6, %xmm0 + movdqa %xmm0, %xmm4 + movaps %xmm0, -64(%rbp) + cmpq %rdx, %rsi + ja .L1198 +.L1199: + movups %xmm8, -16(%r13,%rsi,4) + movq %rsi, %rax + addq $4, %rsi + cmpq %rdx, %rsi + jbe .L1199 +.L1198: + subq %rax, %rdx + leaq 0(,%rax,4), %rsi + movq %rdx, %xmm0 + pshufd $0, %xmm0, %xmm0 + pcmpgtd %xmm5, %xmm0 + movd %xmm0, %edx + testl %edx, %edx + je .L1200 + movl %ecx, 0(%r13,%rax,4) + jmp .L1200 +.L1276: + xorl %ecx, %ecx + jmp .L1204 + .cfi_endproc +.LFE18804: + .size _ZN3hwy6N_SSE46detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0, .-_ZN3hwy6N_SSE46detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + .section .text._ZN3hwy7N_SSSE36detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy7N_SSSE36detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0, @function +_ZN3hwy7N_SSSE36detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0: +.LFB18806: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + pushq %r14 + pushq %r13 + .cfi_offset 15, -24 + .cfi_offset 14, -32 + .cfi_offset 13, -40 + movq %rcx, %r13 + pushq %r12 + pushq %rbx + subq $360, %rsp + .cfi_offset 12, -48 + .cfi_offset 3, -56 + movq %rdi, -88(%rbp) + movq %rsi, -240(%rbp) + movq %rdx, -176(%rbp) + movq %r8, -184(%rbp) + movq %r9, -192(%rbp) + cmpq $64, %rdx + jbe .L1753 + movq %rdi, %r15 + movq %rdi, %r10 + shrq $2, %r15 + movq %r15, %rax + andl $15, %eax + jne .L1754 + movq %rdx, %rbx + movq %rdi, %r14 + movq %r8, %rax +.L1480: + movq 8(%rax), %rdx + movq 16(%rax), %r9 + movq %rdx, %rcx + leaq 1(%r9), %rdi + movq %rdx, %rsi + xorq (%rax), %rdi + rolq $24, %rcx + shrq $11, %rsi + movq %rcx, %rax + leaq (%rdx,%rdx,8), %rcx + leaq 2(%r9), %rdx + addq %rdi, %rax + xorq %rsi, %rcx + movq %rax, %rsi + xorq %rdx, %rcx + leaq (%rax,%rax,8), %rdx + movq %rax, %r8 + rolq $24, %rsi + shrq $11, %r8 + leaq 3(%r9), %rax + addq %rcx, %rsi + xorq %r8, %rdx + movq %rsi, %r8 + xorq %rax, %rdx + leaq (%rsi,%rsi,8), %rax + rolq $24, %r8 + movq %r8, %r11 + movq %rsi, %r8 + leaq 4(%r9), %rsi + addq $5, %r9 + addq %rdx, %r11 + shrq $11, %r8 + xorq %r8, %rax + movq %r11, %r12 + movq %r11, %r8 + shrq $11, %r12 + xorq %rsi, %rax + rolq $24, %r8 + leaq (%r11,%r11,8), %rsi + addq %rax, %r8 + xorq %r12, %rsi + xorq %r9, %rsi + leaq (%r8,%r8,8), %r11 + movq %r8, %r12 + rolq $24, %r8 + addq %rsi, %r8 + shrq $11, %r12 + movl %esi, %esi + xorq %r12, %r11 + movq %r8, %xmm6 + movq %rbx, %r12 + movabsq $68719476719, %r8 + movq %r11, %xmm0 + shrq $4, 
%r12 + movq -184(%rbp), %r11 + cmpq %r8, %rbx + movl $4294967295, %r8d + punpcklqdq %xmm6, %xmm0 + movl %edi, %ebx + cmova %r8, %r12 + movq %r9, 16(%r11) + shrq $32, %rdi + movl %edx, %r9d + movl %eax, %r8d + shrq $32, %rdx + movups %xmm0, (%r11) + movl %ecx, %r11d + shrq $32, %rcx + imulq %r12, %rbx + shrq $32, %rax + imulq %r12, %rdi + imulq %r12, %r11 + imulq %r12, %rcx + shrq $32, %rbx + imulq %r12, %r9 + shrq $32, %rdi + salq $6, %rbx + imulq %r12, %rdx + shrq $32, %r11 + salq $6, %rdi + addq %r14, %rbx + imulq %r12, %r8 + shrq $32, %rcx + salq $6, %r11 + addq %r14, %rdi + shrq $32, %r9 + salq $6, %rcx + addq %r14, %r11 + shrq $32, %rdx + salq $6, %r9 + addq %r14, %rcx + shrq $32, %r8 + salq $6, %rdx + addq %r14, %r9 + salq $6, %r8 + addq %r14, %rdx + addq %r14, %r8 + imulq %r12, %rax + imulq %r12, %rsi + shrq $32, %rax + shrq $32, %rsi + salq $6, %rax + salq $6, %rsi + addq %r14, %rax + leaq (%r14,%rsi), %r12 + xorl %esi, %esi +.L1482: + movdqa (%r11,%rsi,4), %xmm0 + movdqa (%rbx,%rsi,4), %xmm3 + movdqa %xmm0, %xmm1 + movdqa %xmm3, %xmm2 + pcmpgtd %xmm3, %xmm1 + movdqa %xmm1, %xmm4 + pand %xmm1, %xmm2 + pandn %xmm0, %xmm4 + pand %xmm1, %xmm0 + por %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + movdqa %xmm0, %xmm1 + movdqa (%rdi,%rsi,4), %xmm0 + pandn %xmm3, %xmm4 + pcmpgtd %xmm2, %xmm0 + por %xmm4, %xmm1 + movdqa %xmm0, %xmm3 + pand (%rdi,%rsi,4), %xmm0 + pandn %xmm2, %xmm3 + movdqa %xmm1, %xmm2 + por %xmm3, %xmm0 + pcmpgtd %xmm0, %xmm2 + movdqa %xmm2, %xmm3 + pand %xmm2, %xmm0 + pandn %xmm1, %xmm3 + por %xmm3, %xmm0 + movdqa (%rcx,%rsi,4), %xmm3 + movaps %xmm0, 0(%r13,%rsi,4) + movdqa (%rdx,%rsi,4), %xmm0 + movdqa %xmm3, %xmm2 + movdqa %xmm0, %xmm1 + pcmpgtd %xmm3, %xmm1 + movdqa %xmm1, %xmm4 + pand %xmm1, %xmm2 + pandn %xmm0, %xmm4 + pand %xmm1, %xmm0 + por %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + movdqa %xmm0, %xmm1 + movdqa (%r9,%rsi,4), %xmm0 + pandn %xmm3, %xmm4 + pcmpgtd %xmm2, %xmm0 + por %xmm4, %xmm1 + movdqa %xmm0, %xmm3 + pand (%r9,%rsi,4), %xmm0 + pandn %xmm2, %xmm3 + movdqa %xmm1, %xmm2 + por %xmm3, %xmm0 + pcmpgtd %xmm0, %xmm2 + movdqa %xmm2, %xmm3 + pand %xmm2, %xmm0 + pandn %xmm1, %xmm3 + por %xmm3, %xmm0 + movdqa (%r8,%rsi,4), %xmm3 + movaps %xmm0, 64(%r13,%rsi,4) + movdqa (%r12,%rsi,4), %xmm0 + movdqa %xmm3, %xmm2 + movdqa %xmm0, %xmm1 + pcmpgtd %xmm3, %xmm1 + movdqa %xmm1, %xmm4 + pand %xmm1, %xmm2 + pandn %xmm0, %xmm4 + pand %xmm1, %xmm0 + por %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + movdqa %xmm0, %xmm1 + movdqa (%rax,%rsi,4), %xmm0 + pandn %xmm3, %xmm4 + pcmpgtd %xmm2, %xmm0 + por %xmm4, %xmm1 + movdqa %xmm0, %xmm3 + pand (%rax,%rsi,4), %xmm0 + pandn %xmm2, %xmm3 + movdqa %xmm1, %xmm2 + por %xmm3, %xmm0 + pcmpgtd %xmm0, %xmm2 + movdqa %xmm2, %xmm3 + pand %xmm2, %xmm0 + pandn %xmm1, %xmm3 + por %xmm3, %xmm0 + movaps %xmm0, 128(%r13,%rsi,4) + addq $4, %rsi + cmpq $16, %rsi + jne .L1482 + movd 0(%r13), %xmm6 + movdqa 16(%r13), %xmm1 + movdqa 0(%r13), %xmm3 + pshufd $0, %xmm6, %xmm0 + pxor %xmm0, %xmm3 + pxor %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + por %xmm3, %xmm1 + movdqa 32(%r13), %xmm3 + pxor %xmm0, %xmm3 + por %xmm3, %xmm1 + movdqa 48(%r13), %xmm3 + pxor %xmm0, %xmm3 + por %xmm3, %xmm1 + movdqa 64(%r13), %xmm3 + pxor %xmm0, %xmm3 + por %xmm3, %xmm1 + movdqa 80(%r13), %xmm3 + pxor %xmm0, %xmm3 + por %xmm3, %xmm1 + movdqa 96(%r13), %xmm3 + pxor %xmm0, %xmm3 + por %xmm3, %xmm1 + movdqa 112(%r13), %xmm3 + pxor %xmm0, %xmm3 + por %xmm3, %xmm1 + movdqa 128(%r13), %xmm3 + pxor %xmm0, %xmm3 + por %xmm3, %xmm1 + movdqa 144(%r13), %xmm3 + pxor %xmm0, %xmm3 + por %xmm3, %xmm1 + movdqa 
160(%r13), %xmm3 + pxor %xmm0, %xmm3 + por %xmm3, %xmm1 + movdqa 176(%r13), %xmm3 + pxor %xmm0, %xmm3 + por %xmm3, %xmm1 + pxor %xmm3, %xmm3 + pcmpeqd %xmm3, %xmm1 + movmskps %xmm1, %eax + cmpl $15, %eax + je .L1483 + movdqa .LC4(%rip), %xmm0 + movl $4, %esi + movq %r13, %rdi + leaq 192(%r13), %r12 + movups %xmm0, 192(%r13) + movups %xmm0, 208(%r13) + movups %xmm0, 224(%r13) + movups %xmm0, 240(%r13) + movups %xmm0, 256(%r13) + call _ZN3hwy7N_SSSE36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + movd 0(%r13), %xmm6 + pcmpeqd %xmm2, %xmm2 + pshufd $0, %xmm6, %xmm0 + movd 188(%r13), %xmm6 + pshufd $0, %xmm6, %xmm1 + paddd %xmm1, %xmm2 + pcmpeqd %xmm0, %xmm2 + movmskps %xmm2, %eax + cmpl $15, %eax + jne .L1485 + movq -176(%rbp), %rsi + movq -88(%rbp), %rdi + leaq -64(%rbp), %rdx + movq %r12, %rcx + call _ZN3hwy7N_SSSE36detail22MaybePartitionTwoValueINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, %al + jne .L1467 +.L1485: + movl 96(%r13), %ecx + movl $23, %eax + movl $24, %edx + cmpl %ecx, 92(%r13) + je .L1527 + jmp .L1532 + .p2align 4,,10 + .p2align 3 +.L1530: + testq %rax, %rax + je .L1755 +.L1527: + movq %rax, %rdx + subq $1, %rax + movl 0(%r13,%rax,4), %esi + cmpl %ecx, %esi + je .L1530 + cmpl %ecx, 0(%r13,%rdx,4) + je .L1532 + movl %esi, %ecx + jmp .L1529 + .p2align 4,,10 + .p2align 3 +.L1533: + cmpq $47, %rdx + je .L1751 +.L1532: + movq %rdx, %rsi + addq $1, %rdx + cmpl %ecx, 0(%r13,%rdx,4) + je .L1533 + movl $24, %edx + subq $23, %rsi + subq %rax, %rdx + cmpq %rdx, %rsi + jb .L1529 +.L1751: + movl 0(%r13,%rax,4), %ecx +.L1529: + movd %ecx, %xmm6 + pshufd $0, %xmm6, %xmm3 +.L1752: + movl $1, -228(%rbp) +.L1526: + cmpq $0, -192(%rbp) + je .L1756 + movq -176(%rbp), %rax + movq -88(%rbp), %r14 + movaps %xmm3, -80(%rbp) + subq $4, %rax + movdqu (%r14,%rax,4), %xmm6 + movq %rax, %rbx + movq %rax, -112(%rbp) + andl $15, %ebx + movaps %xmm6, -224(%rbp) + andl $12, %eax + je .L1599 + movdqu (%r14), %xmm1 + pcmpeqd %xmm0, %xmm0 + movdqa %xmm1, %xmm4 + movaps %xmm1, -144(%rbp) + pcmpgtd %xmm3, %xmm4 + pxor %xmm4, %xmm0 + movaps %xmm4, -128(%rbp) + movmskps %xmm0, %r12d + movq %r12, %rdi + salq $4, %r12 + call __popcountdi2@PLT + movdqa -144(%rbp), %xmm1 + movdqa -128(%rbp), %xmm4 + leaq _ZZN3hwy7N_SSSE36detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices(%rip), %rcx + movd %eax, %xmm6 + movslq %eax, %rdx + movq %rcx, -168(%rbp) + pshufd $0, %xmm6, %xmm0 + movdqa .LC0(%rip), %xmm6 + movdqa %xmm1, %xmm2 + pshufb (%rcx,%r12), %xmm2 + pcmpgtd %xmm6, %xmm0 + movaps %xmm6, -208(%rbp) + movd %xmm0, %eax + testl %eax, %eax + je .L1537 + movd %xmm2, (%r14) +.L1537: + pshufd $85, %xmm0, %xmm3 + movd %xmm3, %eax + testl %eax, %eax + je .L1538 + movq -88(%rbp), %rax + pshufd $85, %xmm2, %xmm3 + movd %xmm3, 4(%rax) +.L1538: + movdqa %xmm0, %xmm3 + punpckhdq %xmm0, %xmm3 + movd %xmm3, %eax + testl %eax, %eax + je .L1539 + movq -88(%rbp), %rax + movdqa %xmm2, %xmm3 + punpckhdq %xmm2, %xmm3 + movd %xmm3, 8(%rax) +.L1539: + pshufd $255, %xmm0, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L1540 + movq -88(%rbp), %rax + pshufd $255, %xmm2, %xmm2 + movd %xmm2, 12(%rax) +.L1540: + movmskps %xmm4, %r15d + movq -88(%rbp), %rax + movaps %xmm1, -128(%rbp) + movq %r15, %rdi + salq $4, %r15 + leaq (%rax,%rdx,4), %r14 + call __popcountdi2@PLT + movq -168(%rbp), %rcx + movdqa -128(%rbp), %xmm1 + movslq %eax, %r12 + pshufb 
(%rcx,%r15), %xmm1 + movups %xmm1, 0(%r13) + testb $8, -112(%rbp) + je .L1541 + movq -88(%rbp), %rax + pcmpeqd %xmm0, %xmm0 + movdqu 16(%rax), %xmm1 + movdqa %xmm1, %xmm4 + movaps %xmm1, -144(%rbp) + pcmpgtd -80(%rbp), %xmm4 + pxor %xmm4, %xmm0 + movaps %xmm4, -128(%rbp) + movmskps %xmm0, %r15d + movq %r15, %rdi + salq $4, %r15 + call __popcountdi2@PLT + movdqa -144(%rbp), %xmm1 + movdqa -128(%rbp), %xmm4 + movd %eax, %xmm6 + movq -168(%rbp), %rsi + movslq %eax, %rcx + pshufd $0, %xmm6, %xmm0 + movdqa %xmm1, %xmm2 + pcmpgtd -208(%rbp), %xmm0 + pshufb (%rsi,%r15), %xmm2 + movd %xmm0, %eax + testl %eax, %eax + je .L1542 + movd %xmm2, (%r14) +.L1542: + pshufd $85, %xmm0, %xmm3 + movd %xmm3, %eax + testl %eax, %eax + je .L1543 + pshufd $85, %xmm2, %xmm3 + movd %xmm3, 4(%r14) +.L1543: + movdqa %xmm0, %xmm3 + punpckhdq %xmm0, %xmm3 + movd %xmm3, %eax + testl %eax, %eax + je .L1544 + movdqa %xmm2, %xmm3 + punpckhdq %xmm2, %xmm3 + movd %xmm3, 8(%r14) +.L1544: + pshufd $255, %xmm0, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L1545 + pshufd $255, %xmm2, %xmm2 + movd %xmm2, 12(%r14) +.L1545: + movmskps %xmm4, %r15d + movaps %xmm1, -128(%rbp) + leaq (%r14,%rcx,4), %r14 + movq %r15, %rdi + salq $4, %r15 + call __popcountdi2@PLT + movq -168(%rbp), %rcx + movdqa -128(%rbp), %xmm1 + cltq + pshufb (%rcx,%r15), %xmm1 + movups %xmm1, 0(%r13,%r12,4) + addq %rax, %r12 + cmpq $11, %rbx + jbe .L1541 + movq -88(%rbp), %rax + pcmpeqd %xmm0, %xmm0 + movdqu 32(%rax), %xmm1 + movdqa %xmm1, %xmm4 + movaps %xmm1, -144(%rbp) + pcmpgtd -80(%rbp), %xmm4 + pxor %xmm4, %xmm0 + movaps %xmm4, -128(%rbp) + movmskps %xmm0, %r15d + movq %r15, %rdi + salq $4, %r15 + call __popcountdi2@PLT + movdqa -144(%rbp), %xmm1 + movdqa -128(%rbp), %xmm4 + movd %eax, %xmm6 + movq -168(%rbp), %rsi + movslq %eax, %rcx + pshufd $0, %xmm6, %xmm0 + movdqa %xmm1, %xmm2 + pcmpgtd -208(%rbp), %xmm0 + pshufb (%rsi,%r15), %xmm2 + movd %xmm0, %eax + testl %eax, %eax + je .L1546 + movd %xmm2, (%r14) +.L1546: + pshufd $85, %xmm0, %xmm3 + movd %xmm3, %eax + testl %eax, %eax + je .L1547 + pshufd $85, %xmm2, %xmm3 + movd %xmm3, 4(%r14) +.L1547: + movdqa %xmm0, %xmm3 + punpckhdq %xmm0, %xmm3 + movd %xmm3, %eax + testl %eax, %eax + je .L1548 + movdqa %xmm2, %xmm3 + punpckhdq %xmm2, %xmm3 + movd %xmm3, 8(%r14) +.L1548: + pshufd $255, %xmm0, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L1549 + pshufd $255, %xmm2, %xmm2 + movd %xmm2, 12(%r14) +.L1549: + movmskps %xmm4, %r15d + movaps %xmm1, -128(%rbp) + leaq (%r14,%rcx,4), %r14 + movq %r15, %rdi + salq $4, %r15 + call __popcountdi2@PLT + movq -168(%rbp), %rcx + movdqa -128(%rbp), %xmm1 + cltq + pshufb (%rcx,%r15), %xmm1 + movups %xmm1, 0(%r13,%r12,4) + addq %rax, %r12 +.L1541: + leaq -4(%rbx), %rax + leaq 1(%rbx), %rcx + andq $-4, %rax + leaq 0(,%r12,4), %r15 + addq $4, %rax + cmpq $4, %rcx + movl $4, %ecx + cmovbe %rcx, %rax +.L1536: + cmpq %rax, %rbx + je .L1550 + subq %rax, %rbx + movd %ebx, %xmm6 + movq -88(%rbp), %rbx + pshufd $0, %xmm6, %xmm1 + movdqu (%rbx,%rax,4), %xmm2 + pcmpgtd -208(%rbp), %xmm1 + movdqa %xmm2, %xmm3 + movaps %xmm2, -160(%rbp) + pcmpgtd -80(%rbp), %xmm3 + movaps %xmm1, -128(%rbp) + movdqa %xmm3, %xmm0 + movaps %xmm3, -144(%rbp) + pandn %xmm1, %xmm0 + movmskps %xmm0, %ebx + movq %rbx, %rdi + salq $4, %rbx + call __popcountdi2@PLT + movdqa -160(%rbp), %xmm2 + movq -168(%rbp), %rsi + movd %eax, %xmm7 + movslq %eax, %rcx + movdqa -128(%rbp), %xmm1 + movdqa -144(%rbp), %xmm3 + pshufd $0, %xmm7, %xmm0 + movdqa %xmm2, %xmm4 + pcmpgtd -208(%rbp), %xmm0 + pshufb (%rsi,%rbx), %xmm4 + 
movd %xmm0, %eax + testl %eax, %eax + je .L1551 + movd %xmm4, (%r14) +.L1551: + pshufd $85, %xmm0, %xmm5 + movd %xmm5, %eax + testl %eax, %eax + je .L1552 + pshufd $85, %xmm4, %xmm5 + movd %xmm5, 4(%r14) +.L1552: + movdqa %xmm0, %xmm5 + punpckhdq %xmm0, %xmm5 + movd %xmm5, %eax + testl %eax, %eax + je .L1553 + movdqa %xmm4, %xmm5 + punpckhdq %xmm4, %xmm5 + movd %xmm5, 8(%r14) +.L1553: + pshufd $255, %xmm0, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L1554 + pshufd $255, %xmm4, %xmm4 + movd %xmm4, 12(%r14) +.L1554: + pand %xmm1, %xmm3 + movaps %xmm2, -128(%rbp) + leaq (%r14,%rcx,4), %r14 + movmskps %xmm3, %ebx + movq %rbx, %rdi + salq $4, %rbx + call __popcountdi2@PLT + movq -168(%rbp), %rcx + movdqa -128(%rbp), %xmm2 + cltq + pshufb (%rcx,%rbx), %xmm2 + addq %rax, %r12 + movups %xmm2, 0(%r13,%r15) + leaq 0(,%r12,4), %r15 +.L1550: + movq -112(%rbp), %rbx + movq -88(%rbp), %rax + movl %r15d, %ecx + subq %r12, %rbx + leaq (%rax,%rbx,4), %rax + cmpl $8, %r15d + jnb .L1555 + testb $4, %r15b + jne .L1757 + testl %ecx, %ecx + jne .L1758 +.L1556: + movl %r15d, %ecx + cmpl $8, %r15d + jnb .L1559 + andl $4, %r15d + jne .L1759 + testl %ecx, %ecx + jne .L1760 +.L1560: + movq -112(%rbp), %rcx + movq %r14, %rax + subq -88(%rbp), %rax + sarq $2, %rax + subq %rax, %rcx + subq %rax, %rbx + movq %rax, -256(%rbp) + movq %rcx, -248(%rbp) + leaq (%r14,%rbx,4), %rax + je .L1600 + movdqu (%r14), %xmm6 + movdqu -16(%rax), %xmm7 + leaq 64(%r14), %rsi + leaq -64(%rax), %r8 + movaps %xmm6, -272(%rbp) + movdqu 16(%r14), %xmm6 + movaps %xmm7, -384(%rbp) + movaps %xmm6, -288(%rbp) + movdqu 32(%r14), %xmm6 + movaps %xmm6, -304(%rbp) + movdqu 48(%r14), %xmm6 + movaps %xmm6, -320(%rbp) + movdqu -64(%rax), %xmm6 + movaps %xmm6, -336(%rbp) + movdqu -48(%rax), %xmm6 + movaps %xmm6, -352(%rbp) + movdqu -32(%rax), %xmm6 + movaps %xmm6, -368(%rbp) + cmpq %r8, %rsi + je .L1601 + movq %r13, -392(%rbp) + xorl %r15d, %r15d + movq %rsi, %r13 + leaq _ZZN3hwy7N_SSSE36detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices(%rip), %r12 + jmp .L1567 + .p2align 4,,10 + .p2align 3 +.L1762: + movdqu -64(%r8), %xmm3 + movdqu -48(%r8), %xmm2 + prefetcht0 -256(%r8) + subq $64, %r8 + movdqu 32(%r8), %xmm1 + movdqu 48(%r8), %xmm0 +.L1566: + movdqa %xmm3, %xmm4 + movq %r8, -96(%rbp) + pcmpgtd -80(%rbp), %xmm4 + movaps %xmm0, -160(%rbp) + movaps %xmm1, -144(%rbp) + movaps %xmm2, -128(%rbp) + movmskps %xmm4, %edi + movq %rdi, %rax + salq $4, %rax + pshufb (%r12,%rax), %xmm3 + movaps %xmm3, -112(%rbp) + call __popcountdi2@PLT + movdqa -112(%rbp), %xmm3 + movdqa -128(%rbp), %xmm2 + leaq -4(%rbx,%r15), %rdx + cltq + movups %xmm3, (%r14,%r15,4) + addq $4, %r15 + movups %xmm3, (%r14,%rdx,4) + movdqa %xmm2, %xmm3 + subq %rax, %r15 + pcmpgtd -80(%rbp), %xmm3 + movmskps %xmm3, %edi + movq %rdi, %rax + salq $4, %rax + pshufb (%r12,%rax), %xmm2 + movaps %xmm2, -112(%rbp) + call __popcountdi2@PLT + movdqa -112(%rbp), %xmm2 + leaq -8(%rbx,%r15), %rdx + movdqa -144(%rbp), %xmm1 + cltq + movups %xmm2, (%r14,%r15,4) + movups %xmm2, (%r14,%rdx,4) + movdqa %xmm1, %xmm2 + movl $4, %edx + pcmpgtd -80(%rbp), %xmm2 + subq %rax, %rdx + addq %rdx, %r15 + movmskps %xmm2, %edi + movq %rdi, %rax + salq $4, %rax + pshufb (%r12,%rax), %xmm1 + movaps %xmm1, -112(%rbp) + call __popcountdi2@PLT + movdqa -112(%rbp), %xmm1 + leaq -12(%rbx,%r15), %rdx + movdqa -160(%rbp), %xmm0 + cltq + subq $16, %rbx + movups %xmm1, (%r14,%r15,4) + movups %xmm1, (%r14,%rdx,4) + movdqa %xmm0, %xmm1 + movl $4, %edx + pcmpgtd -80(%rbp), %xmm1 + subq %rax, 
%rdx + addq %rdx, %r15 + movmskps %xmm1, %edi + movq %rdi, %rax + salq $4, %rax + pshufb (%r12,%rax), %xmm0 + movaps %xmm0, -112(%rbp) + call __popcountdi2@PLT + movdqa -112(%rbp), %xmm0 + movl $4, %edx + movq -96(%rbp), %r8 + cltq + leaq (%r15,%rbx), %rcx + subq %rax, %rdx + movups %xmm0, (%r14,%r15,4) + movups %xmm0, (%r14,%rcx,4) + addq %rdx, %r15 + cmpq %r8, %r13 + je .L1761 +.L1567: + movq %r13, %rax + subq %r14, %rax + sarq $2, %rax + subq %r15, %rax + cmpq $16, %rax + ja .L1762 + movdqu 0(%r13), %xmm3 + movdqu 16(%r13), %xmm2 + prefetcht0 256(%r13) + addq $64, %r13 + movdqu -32(%r13), %xmm1 + movdqu -16(%r13), %xmm0 + jmp .L1566 + .p2align 4,,10 + .p2align 3 +.L1754: + movq -176(%rbp), %rbx + movl $16, %edx + subq %rax, %rdx + leaq -16(%rax,%rbx), %rbx + leaq (%rdi,%rdx,4), %r14 + movq %r8, %rax + jmp .L1480 + .p2align 4,,10 + .p2align 3 +.L1761: + leaq (%rbx,%r15), %rax + movq -392(%rbp), %r13 + movq %rax, -128(%rbp) + leaq (%r14,%r15,4), %rax + addq $4, %r15 + movq %rax, -144(%rbp) +.L1564: + movdqa -272(%rbp), %xmm5 + movdqa %xmm5, %xmm0 + pcmpgtd -80(%rbp), %xmm0 + movmskps %xmm0, %edi + movdqa %xmm5, %xmm0 + movq %rdi, %rax + salq $4, %rax + pshufb (%r12,%rax), %xmm0 + movaps %xmm0, -112(%rbp) + call __popcountdi2@PLT + movdqa -112(%rbp), %xmm0 + movq -144(%rbp), %rcx + movdqa -288(%rbp), %xmm7 + cltq + movups %xmm0, (%rcx) + movq -128(%rbp), %rcx + subq %rax, %r15 + movups %xmm0, -16(%r14,%rcx,4) + movdqa %xmm7, %xmm0 + pcmpgtd -80(%rbp), %xmm0 + movmskps %xmm0, %edi + movdqa %xmm7, %xmm0 + movq %rdi, %rax + salq $4, %rax + pshufb (%r12,%rax), %xmm0 + movaps %xmm0, -112(%rbp) + call __popcountdi2@PLT + movdqa -112(%rbp), %xmm0 + leaq -8(%rbx,%r15), %rcx + movdqa -304(%rbp), %xmm5 + cltq + movups %xmm0, (%r14,%r15,4) + subq %rax, %r15 + movups %xmm0, (%r14,%rcx,4) + movdqa %xmm5, %xmm0 + addq $4, %r15 + pcmpgtd -80(%rbp), %xmm0 + movmskps %xmm0, %edi + movdqa %xmm5, %xmm0 + movq %rdi, %rax + salq $4, %rax + pshufb (%r12,%rax), %xmm0 + movaps %xmm0, -112(%rbp) + call __popcountdi2@PLT + movdqa -112(%rbp), %xmm0 + leaq -12(%rbx,%r15), %rcx + movdqa -320(%rbp), %xmm7 + cltq + movups %xmm0, (%r14,%r15,4) + movups %xmm0, (%r14,%rcx,4) + movdqa %xmm7, %xmm0 + movl $4, %ecx + pcmpgtd -80(%rbp), %xmm0 + subq %rax, %rcx + addq %rcx, %r15 + movmskps %xmm0, %edi + movdqa %xmm7, %xmm0 + movq %rdi, %rax + salq $4, %rax + pshufb (%r12,%rax), %xmm0 + movaps %xmm0, -112(%rbp) + call __popcountdi2@PLT + movdqa -112(%rbp), %xmm0 + leaq -16(%rbx,%r15), %rcx + movdqa -336(%rbp), %xmm5 + cltq + movups %xmm0, (%r14,%r15,4) + movups %xmm0, (%r14,%rcx,4) + movdqa %xmm5, %xmm0 + movl $4, %ecx + pcmpgtd -80(%rbp), %xmm0 + subq %rax, %rcx + addq %rcx, %r15 + movmskps %xmm0, %edi + movdqa %xmm5, %xmm0 + movq %rdi, %rax + salq $4, %rax + pshufb (%r12,%rax), %xmm0 + movaps %xmm0, -112(%rbp) + call __popcountdi2@PLT + movdqa -112(%rbp), %xmm0 + leaq -20(%rbx,%r15), %rcx + movdqa -352(%rbp), %xmm5 + cltq + movups %xmm0, (%r14,%r15,4) + movups %xmm0, (%r14,%rcx,4) + movdqa %xmm5, %xmm0 + movl $4, %ecx + pcmpgtd -80(%rbp), %xmm0 + subq %rax, %rcx + addq %rcx, %r15 + movmskps %xmm0, %edi + movdqa %xmm5, %xmm0 + movq %rdi, %rax + salq $4, %rax + pshufb (%r12,%rax), %xmm0 + movaps %xmm0, -112(%rbp) + call __popcountdi2@PLT + movdqa -112(%rbp), %xmm0 + leaq -24(%rbx,%r15), %rcx + movdqa -368(%rbp), %xmm5 + cltq + movups %xmm0, (%r14,%r15,4) + movups %xmm0, (%r14,%rcx,4) + movdqa %xmm5, %xmm0 + movl $4, %ecx + pcmpgtd -80(%rbp), %xmm0 + subq %rax, %rcx + addq %rcx, %r15 + movmskps %xmm0, %edi + movdqa %xmm5, %xmm0 + 
movq %rdi, %rax + salq $4, %rax + pshufb (%r12,%rax), %xmm0 + movaps %xmm0, -112(%rbp) + call __popcountdi2@PLT + movdqa -112(%rbp), %xmm0 + leaq -28(%rbx,%r15), %rcx + movdqa -384(%rbp), %xmm5 + cltq + movups %xmm0, (%r14,%r15,4) + movups %xmm0, (%r14,%rcx,4) + movdqa %xmm5, %xmm0 + movl $4, %ecx + pcmpgtd -80(%rbp), %xmm0 + subq %rax, %rcx + addq %rcx, %r15 + movmskps %xmm0, %edi + movdqa %xmm5, %xmm0 + movq %rdi, %rax + salq $4, %rax + pshufb (%r12,%rax), %xmm0 + movaps %xmm0, -112(%rbp) + call __popcountdi2@PLT + movdqa -112(%rbp), %xmm0 + leaq -32(%rbx,%r15), %rcx + movl $4, %edx + cltq + movups %xmm0, (%r14,%r15,4) + subq %rax, %rdx + movups %xmm0, (%r14,%rcx,4) + movq -248(%rbp), %rcx + leaq (%rdx,%r15), %rbx + leaq 0(,%rbx,4), %r15 + subq %rbx, %rcx +.L1563: + movq -248(%rbp), %rsi + movdqa -224(%rbp), %xmm3 + cmpq $4, %rcx + pcmpeqd %xmm0, %xmm0 + pcmpgtd -80(%rbp), %xmm3 + leaq -16(,%rsi,4), %rax + cmovnb %r15, %rax + pxor %xmm3, %xmm0 + movdqu (%r14,%rax), %xmm6 + movaps %xmm3, -80(%rbp) + movmskps %xmm0, %r12d + movups %xmm6, (%r14,%rsi,4) + movq %r12, %rdi + salq $4, %r12 + movaps %xmm6, -112(%rbp) + call __popcountdi2@PLT + movq -168(%rbp), %rsi + movdqa -80(%rbp), %xmm3 + movd %eax, %xmm6 + movdqa -224(%rbp), %xmm1 + movslq %eax, %rcx + pshufd $0, %xmm6, %xmm0 + pcmpgtd -208(%rbp), %xmm0 + pshufb (%rsi,%r12), %xmm1 + movd %xmm0, %eax + testl %eax, %eax + je .L1569 + movd %xmm1, (%r14,%r15) +.L1569: + pshufd $85, %xmm0, %xmm2 + movd %xmm2, %eax + testl %eax, %eax + je .L1570 + pshufd $85, %xmm1, %xmm2 + movd %xmm2, 4(%r14,%r15) +.L1570: + movdqa %xmm0, %xmm2 + punpckhdq %xmm0, %xmm2 + movd %xmm2, %eax + testl %eax, %eax + je .L1571 + movdqa %xmm1, %xmm2 + punpckhdq %xmm1, %xmm2 + movd %xmm2, 8(%r14,%r15) +.L1571: + pshufd $255, %xmm0, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L1572 + pshufd $255, %xmm1, %xmm1 + movd %xmm1, 12(%r14,%r15) +.L1572: + movmskps %xmm3, %r12d + addq %rcx, %rbx + movq %r12, %rdi + salq $4, %r12 + leaq 0(,%rbx,4), %r15 + call __popcountdi2@PLT + movq -168(%rbp), %rcx + movdqa -224(%rbp), %xmm1 + movd %eax, %xmm6 + pshufd $0, %xmm6, %xmm0 + pshufb (%rcx,%r12), %xmm1 + pcmpgtd -208(%rbp), %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L1573 + movd %xmm1, (%r14,%rbx,4) +.L1573: + pshufd $85, %xmm0, %xmm2 + movd %xmm2, %eax + testl %eax, %eax + je .L1574 + pshufd $85, %xmm1, %xmm2 + movd %xmm2, 4(%r14,%r15) +.L1574: + movdqa %xmm0, %xmm2 + punpckhdq %xmm0, %xmm2 + movd %xmm2, %eax + testl %eax, %eax + je .L1575 + movdqa %xmm1, %xmm2 + punpckhdq %xmm1, %xmm2 + movd %xmm2, 8(%r14,%r15) +.L1575: + pshufd $255, %xmm0, %xmm0 + movd %xmm0, %eax + testl %eax, %eax + je .L1576 + pshufd $255, %xmm1, %xmm1 + movd %xmm1, 12(%r14,%r15) +.L1576: + movq -192(%rbp), %r12 + addq -256(%rbp), %rbx + subq $1, %r12 + cmpl $2, -228(%rbp) + je .L1578 + movq -184(%rbp), %r8 + movq %r12, %r9 + movq %r13, %rcx + movq %rbx, %rdx + movq -240(%rbp), %rsi + movq -88(%rbp), %rdi + call _ZN3hwy7N_SSSE36detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + cmpl $3, -228(%rbp) + je .L1467 +.L1578: + movq -176(%rbp), %rdx + movq -88(%rbp), %rax + movq %r12, %r9 + movq %r13, %rcx + movq -184(%rbp), %r8 + movq -240(%rbp), %rsi + subq %rbx, %rdx + leaq (%rax,%rbx,4), %rdi + call _ZN3hwy7N_SSSE36detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L1467: + addq $360, %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq 
%r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L1559: + .cfi_restore_state + movq 0(%r13), %rcx + leaq 8(%rax), %rdi + andq $-8, %rdi + movq %rcx, (%rax) + movl %r15d, %ecx + movq -8(%r13,%rcx), %rsi + movq %rsi, -8(%rax,%rcx) + subq %rdi, %rax + movq %r13, %rsi + leal (%r15,%rax), %ecx + subq %rax, %rsi + shrl $3, %ecx + rep movsq + jmp .L1560 + .p2align 4,,10 + .p2align 3 +.L1555: + movq (%rax), %rcx + leaq 8(%r14), %rdi + andq $-8, %rdi + movq %rcx, (%r14) + movl %r15d, %ecx + movq -8(%rax,%rcx), %rsi + movq %rsi, -8(%r14,%rcx) + movq %r14, %rcx + movq %rax, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %r15d, %ecx + shrl $3, %ecx + rep movsq + jmp .L1556 +.L1755: + movl 0(%r13), %ecx + jmp .L1529 +.L1760: + movzbl 0(%r13), %esi + movb %sil, (%rax) + testb $2, %cl + je .L1560 + movzwl -2(%r13,%rcx), %esi + movw %si, -2(%rax,%rcx) + jmp .L1560 +.L1758: + movzbl (%rax), %esi + movb %sil, (%r14) + testb $2, %cl + je .L1556 + movzwl -2(%rax,%rcx), %esi + movw %si, -2(%r14,%rcx) + jmp .L1556 +.L1753: + cmpq $1, %rdx + jbe .L1467 + movq %rdi, %rax + addq $256, %rax + cmpq %rax, %rsi + jb .L1763 + movl $4, %esi + call _ZN3hwy7N_SSSE36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + jmp .L1467 +.L1483: + movq -88(%rbp), %rax + andl $3, %r15d + movl $4, %edi + movdqa .LC0(%rip), %xmm7 + subq %r15, %rdi + movdqu (%rax), %xmm5 + movaps %xmm7, -208(%rbp) + movaps %xmm5, -80(%rbp) + movd %edi, %xmm5 + movdqa -80(%rbp), %xmm1 + pshufd $0, %xmm5, %xmm3 + pcmpeqd %xmm0, %xmm1 + pcmpgtd %xmm7, %xmm3 + pandn %xmm3, %xmm1 + movmskps %xmm1, %eax + testl %eax, %eax + jne .L1764 + movq -88(%rbp), %rax + pxor %xmm3, %xmm3 + movq -176(%rbp), %r8 + pxor %xmm5, %xmm5 + movdqa %xmm3, %xmm1 + leaq 256(%rax,%rdi,4), %rsi + .p2align 4,,10 + .p2align 3 +.L1489: + movq %rdi, %rcx + leaq 64(%rdi), %rdi + cmpq %rdi, %r8 + jb .L1765 + leaq -256(%rsi), %rax +.L1488: + movdqa (%rax), %xmm4 + leaq 32(%rax), %rdx + pxor %xmm2, %xmm4 + por %xmm4, %xmm1 + movdqa 16(%rax), %xmm4 + pxor %xmm2, %xmm4 + por %xmm4, %xmm3 + movdqa 32(%rax), %xmm4 + pxor %xmm2, %xmm4 + por %xmm4, %xmm1 + movdqa 48(%rax), %xmm4 + pxor %xmm2, %xmm4 + por %xmm4, %xmm3 + movdqa 64(%rax), %xmm4 + pxor %xmm2, %xmm4 + por %xmm4, %xmm1 + movdqa 80(%rax), %xmm4 + leaq 96(%rdx), %rax + pxor %xmm2, %xmm4 + por %xmm4, %xmm3 + movdqa 64(%rdx), %xmm4 + pxor %xmm2, %xmm4 + por %xmm4, %xmm1 + movdqa 80(%rdx), %xmm4 + pxor %xmm2, %xmm4 + por %xmm4, %xmm3 + cmpq %rsi, %rax + jne .L1488 + movdqa %xmm1, %xmm4 + leaq 352(%rdx), %rsi + por %xmm3, %xmm4 + pcmpeqd %xmm5, %xmm4 + movmskps %xmm4, %eax + cmpl $15, %eax + je .L1489 + movq -88(%rbp), %rax + movdqa %xmm0, %xmm1 + pcmpeqd %xmm2, %xmm2 + movq -88(%rbp), %rdx + pcmpeqd (%rax,%rcx,4), %xmm1 + pxor %xmm2, %xmm1 + movmskps %xmm1, %eax + testl %eax, %eax + jne .L1491 + .p2align 4,,10 + .p2align 3 +.L1490: + addq $4, %rcx + movdqa %xmm0, %xmm1 + pcmpeqd (%rdx,%rcx,4), %xmm1 + pxor %xmm2, %xmm1 + movmskps %xmm1, %eax + testl %eax, %eax + je .L1490 +.L1491: + rep bsfl %eax, %eax + cltq + addq %rcx, %rax +.L1487: + movq -88(%rbp), %rcx + leaq (%rcx,%rax,4), %rdi + movl (%rdi), %r12d + movd %r12d, %xmm7 + pshufd $0, %xmm7, %xmm2 + movdqa %xmm2, %xmm1 + movaps %xmm2, -80(%rbp) + pcmpgtd %xmm0, %xmm1 + movmskps %xmm1, %edx + testl %edx, %edx + jne .L1496 + movq -176(%rbp), %r15 + movl %r12d, -160(%rbp) + xorl %ebx, %ebx + movq %r10, -96(%rbp) + leaq -4(%r15), %r14 + movaps %xmm0, -144(%rbp) + movq %r14, %r12 + movaps 
%xmm0, -112(%rbp) + movq %r13, %r14 + movq %rcx, %r13 + movaps %xmm2, -128(%rbp) + jmp .L1505 + .p2align 4,,10 + .p2align 3 +.L1497: + movdqa -144(%rbp), %xmm7 + movmskps %xmm1, %edi + movups %xmm7, 0(%r13,%r12,4) + call __popcountdi2@PLT + cltq + addq %rax, %rbx + leaq -4(%r12), %rax + cmpq %rax, %r15 + jbe .L1766 + movq %rax, %r12 +.L1505: + movdqu 0(%r13,%r12,4), %xmm1 + movdqu 0(%r13,%r12,4), %xmm4 + pcmpeqd -112(%rbp), %xmm1 + pcmpeqd -128(%rbp), %xmm4 + movdqa %xmm1, %xmm5 + movdqa %xmm1, %xmm6 + por %xmm4, %xmm5 + movmskps %xmm5, %eax + cmpl $15, %eax + je .L1497 + pcmpeqd %xmm1, %xmm1 + movq -88(%rbp), %rsi + movq %r14, %r13 + movq %r12, %r14 + pxor %xmm1, %xmm6 + movq -176(%rbp), %rdx + movdqa -112(%rbp), %xmm0 + leaq 4(%r14), %rcx + pandn %xmm6, %xmm4 + movdqa -144(%rbp), %xmm3 + movdqa -128(%rbp), %xmm2 + movmskps %xmm4, %eax + subq %rbx, %rdx + movl -160(%rbp), %r12d + rep bsfl %eax, %eax + cltq + addq %r14, %rax + movd (%rsi,%rax,4), %xmm7 + leaq 8(%r14), %rax + pshufd $0, %xmm7, %xmm1 + movdqa %xmm1, %xmm4 + movaps %xmm1, -64(%rbp) + cmpq %rax, %rdx + jb .L1498 +.L1499: + movdqa -80(%rbp), %xmm6 + movq %rax, %rcx + movups %xmm6, -16(%rsi,%rax,4) + addq $4, %rax + cmpq %rax, %rdx + jnb .L1499 +.L1498: + subq %rcx, %rdx + leaq 0(,%rcx,4), %rsi + movq %rdx, %xmm1 + pshufd $0, %xmm1, %xmm1 + pcmpgtd -208(%rbp), %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L1500 + movq -88(%rbp), %rax + movl %r12d, (%rax,%rcx,4) +.L1500: + pshufd $85, %xmm1, %xmm5 + movd %xmm5, %eax + testl %eax, %eax + je .L1501 + movq -88(%rbp), %rax + pshufd $85, %xmm2, %xmm5 + movd %xmm5, 4(%rax,%rsi) +.L1501: + movdqa %xmm1, %xmm5 + punpckhdq %xmm1, %xmm5 + movd %xmm5, %eax + testl %eax, %eax + je .L1502 + movq -88(%rbp), %rax + movdqa %xmm2, %xmm5 + punpckhdq %xmm2, %xmm5 + movd %xmm5, 8(%rax,%rsi) +.L1502: + pshufd $255, %xmm1, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L1504 + movq -88(%rbp), %rax + pshufd $255, %xmm2, %xmm1 + movd %xmm1, 12(%rax,%rsi) +.L1504: + movdqa %xmm0, %xmm1 + pcmpeqd .LC5(%rip), %xmm1 + movmskps %xmm1, %eax + cmpl $15, %eax + je .L1596 + movdqa %xmm0, %xmm1 + pcmpeqd .LC6(%rip), %xmm1 + movmskps %xmm1, %eax + cmpl $15, %eax + je .L1522 + movdqa %xmm4, %xmm1 + pcmpgtd %xmm2, %xmm1 + movdqa %xmm1, %xmm6 + movdqa %xmm1, %xmm5 + pandn %xmm4, %xmm6 + pand %xmm2, %xmm5 + por %xmm6, %xmm5 + movdqa %xmm0, %xmm6 + pcmpgtd %xmm5, %xmm6 + movmskps %xmm6, %eax + testl %eax, %eax + jne .L1767 + movdqa %xmm3, %xmm1 + movl $64, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L1517: + movq -88(%rbp), %rbx + leaq (%rcx,%rax,4), %rdx + addq $1, %rax + movdqu (%rbx,%rdx,4), %xmm4 + movdqa %xmm4, %xmm2 + pcmpgtd %xmm1, %xmm2 + movdqa %xmm2, %xmm5 + pand %xmm2, %xmm1 + pandn %xmm4, %xmm5 + por %xmm5, %xmm1 + cmpq $16, %rax + jne .L1517 + movdqa %xmm0, %xmm2 + pcmpgtd %xmm1, %xmm2 + movmskps %xmm2, %eax + testl %eax, %eax + jne .L1752 + leaq 64(%rsi), %rax + cmpq %rax, -176(%rbp) + jb .L1768 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L1517 +.L1599: + movdqa .LC0(%rip), %xmm7 + leaq _ZZN3hwy7N_SSSE36detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices(%rip), %rcx + xorl %r15d, %r15d + xorl %r12d, %r12d + movq %rcx, -168(%rbp) + movaps %xmm7, -208(%rbp) + jmp .L1536 +.L1600: + xorl %r15d, %r15d + jmp .L1563 +.L1601: + movq %r14, -144(%rbp) + movl $4, %r15d + leaq _ZZN3hwy7N_SSSE36detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices(%rip), %r12 + movq %rbx, -128(%rbp) + jmp 
.L1564 +.L1756: + movq -176(%rbp), %rsi + movq -88(%rbp), %rdi + leaq -1(%rsi), %rbx + movq %rbx, %r12 + shrq %r12 + .p2align 4,,10 + .p2align 3 +.L1534: + movq %r12, %rdx + call _ZN3hwy7N_SSSE36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0 + subq $1, %r12 + jnb .L1534 + .p2align 4,,10 + .p2align 3 +.L1535: + movl (%rdi,%rbx,4), %edx + movl (%rdi), %eax + movq %rbx, %rsi + movl %edx, (%rdi) + xorl %edx, %edx + movl %eax, (%rdi,%rbx,4) + call _ZN3hwy7N_SSSE36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0 + subq $1, %rbx + jne .L1535 + jmp .L1467 +.L1759: + movl 0(%r13), %esi + movl %esi, (%rax) + movl -4(%r13,%rcx), %esi + movl %esi, -4(%rax,%rcx) + jmp .L1560 +.L1757: + movl (%rax), %esi + movl %esi, (%r14) + movl -4(%rax,%rcx), %esi + movl %esi, -4(%r14,%rcx) + jmp .L1556 +.L1765: + movq -88(%rbp), %rsi + movq -176(%rbp), %rdi + pcmpeqd %xmm2, %xmm2 +.L1493: + movq %rcx, %rdx + addq $4, %rcx + cmpq %rcx, %rdi + jb .L1769 + movdqa %xmm0, %xmm1 + pcmpeqd -16(%rsi,%rcx,4), %xmm1 + pxor %xmm2, %xmm1 + movmskps %xmm1, %eax + testl %eax, %eax + je .L1493 +.L1750: + rep bsfl %eax, %eax + cltq + addq %rdx, %rax + jmp .L1487 +.L1496: + leaq -64(%rbp), %rdx + movq %r13, %rcx + movdqa %xmm2, %xmm1 + movaps %xmm2, -80(%rbp) + movq -176(%rbp), %rsi + subq %rax, %rsi + call _ZN3hwy7N_SSSE36detail22MaybePartitionTwoValueINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, %al + jne .L1467 + movd 0(%r13), %xmm7 + movdqa -64(%rbp), %xmm4 + movdqa -80(%rbp), %xmm2 + pshufd $0, %xmm7, %xmm0 + movdqa %xmm0, %xmm3 + jmp .L1504 +.L1766: + movq -88(%rbp), %rax + movq %r14, %r13 + movq %r12, %r14 + movdqa -112(%rbp), %xmm0 + movd %r14d, %xmm6 + movdqa -128(%rbp), %xmm2 + movq -96(%rbp), %r10 + movdqu (%rax), %xmm7 + pshufd $0, %xmm6, %xmm4 + movq -176(%rbp), %r15 + pcmpgtd -208(%rbp), %xmm4 + movdqa -144(%rbp), %xmm3 + movl -160(%rbp), %r12d + movaps %xmm7, -112(%rbp) + movdqa -112(%rbp), %xmm1 + movdqa -112(%rbp), %xmm5 + subq %rbx, %r15 + pcmpeqd %xmm0, %xmm1 + pcmpeqd %xmm2, %xmm5 + movdqa %xmm4, %xmm6 + pand %xmm1, %xmm6 + por %xmm5, %xmm1 + pcmpeqd %xmm5, %xmm5 + pxor %xmm5, %xmm4 + por %xmm4, %xmm1 + movmskps %xmm1, %eax + cmpl $15, %eax + jne .L1770 + movmskps %xmm6, %edi + movaps %xmm2, -128(%rbp) + movaps %xmm3, -112(%rbp) + movq %r10, -144(%rbp) + call __popcountdi2@PLT + movq -88(%rbp), %rbx + movdqa -112(%rbp), %xmm3 + movslq %eax, %rdx + movq %r15, %rax + movdqa -128(%rbp), %xmm2 + subq %rdx, %rax + movups %xmm3, (%rbx) + cmpq $3, %rax + jbe .L1585 + leaq -4(%rax), %rdx + movq -144(%rbp), %r10 + movq %rdx, %rcx + shrq $2, %rcx + salq $4, %rcx + leaq 16(%rbx,%rcx), %rcx +.L1514: + movdqa -80(%rbp), %xmm7 + addq $16, %r10 + movups %xmm7, -16(%r10) + cmpq %rcx, %r10 + jne .L1514 + andq $-4, %rdx + addq $4, %rdx + leaq 0(,%rdx,4), %rcx + subq %rdx, %rax +.L1513: + movaps %xmm2, 0(%r13) + testq %rax, %rax + je .L1467 + movq -88(%rbp), %rdi + leaq 0(,%rax,4), %rdx + movq %r13, %rsi + addq %rcx, %rdi + call memcpy@PLT + jmp .L1467 +.L1768: + movq -176(%rbp), %rcx + movq %rbx, %rdx + jmp .L1524 +.L1525: + movdqu -16(%rdx,%rsi,4), %xmm6 + movdqa %xmm0, %xmm1 + pcmpgtd %xmm6, %xmm1 + movmskps %xmm1, %eax + testl %eax, %eax + jne .L1752 +.L1524: + movq %rsi, %rax + addq $4, %rsi + cmpq %rsi, %rcx + jnb .L1525 + movq -176(%rbp), %rbx + cmpq %rax, %rbx + je .L1596 + movq -88(%rbp), %rax + movdqu -16(%rax,%rbx,4), 
%xmm7 + movaps %xmm7, -80(%rbp) + pcmpgtd -80(%rbp), %xmm0 + movmskps %xmm0, %eax + cmpl $1, %eax + movl $1, %eax + adcl $0, %eax + movl %eax, -228(%rbp) + jmp .L1526 +.L1769: + movq -176(%rbp), %rax + pcmpeqd %xmm2, %xmm2 + leaq -4(%rax), %rdx + movq -88(%rbp), %rax + movdqu (%rax,%rdx,4), %xmm6 + movaps %xmm6, -80(%rbp) + movdqa -80(%rbp), %xmm1 + pcmpeqd %xmm0, %xmm1 + pxor %xmm2, %xmm1 + movmskps %xmm1, %eax + testl %eax, %eax + je .L1467 + jmp .L1750 +.L1771: + movq %rbx, %rdx + jmp .L1520 +.L1521: + movdqu -16(%rdx,%rsi,4), %xmm1 + pcmpgtd %xmm0, %xmm1 + movmskps %xmm1, %eax + testl %eax, %eax + jne .L1752 +.L1520: + movq %rsi, %rax + addq $4, %rsi + cmpq %rsi, -176(%rbp) + jnb .L1521 + movq -176(%rbp), %rbx + cmpq %rax, %rbx + je .L1522 + movq -88(%rbp), %rax + movdqu -16(%rax,%rbx,4), %xmm6 + movaps %xmm6, -80(%rbp) + movdqa -80(%rbp), %xmm1 + pcmpgtd %xmm0, %xmm1 + movmskps %xmm1, %eax + testl %eax, %eax + jne .L1752 +.L1522: + movl $3, -228(%rbp) + pcmpeqd %xmm3, %xmm3 + paddd %xmm0, %xmm3 + jmp .L1526 +.L1596: + movl $2, -228(%rbp) + jmp .L1526 +.L1764: + rep bsfl %eax, %eax + cltq + jmp .L1487 +.L1763: + movq %rdx, %rcx + xorl %eax, %eax + cmpq $3, %rdx + jbe .L1473 + movq %rdx, %r10 + leaq -4(%rdx), %rdx + movq (%rdi), %rcx + movq %rdi, %rbx + movq %rdx, %rax + shrq $2, %rax + movq %rcx, 0(%r13) + addq $1, %rax + salq $4, %rax + movl %eax, %ecx + movq -8(%rdi,%rcx), %rsi + leaq 8(%r13), %rdi + andq $-8, %rdi + movq %rsi, -8(%r13,%rcx) + movq %r13, %rcx + subq %rdi, %rcx + subq %rcx, %rbx + addl %eax, %ecx + movq %rdx, %rax + andq $-4, %rax + movq %rbx, %rsi + shrl $3, %ecx + rep movsq + addq $4, %rax + movq %r10, %rcx + subq %rax, %rcx + je .L1476 +.L1473: + salq $2, %rax + leaq 0(,%rcx,4), %rdx + testq %rcx, %rcx + movq -88(%rbp), %rbx + movl $4, %ecx + leaq 0(%r13,%rax), %rdi + cmove %rcx, %rdx + leaq (%rbx,%rax), %rsi + call memcpy@PLT +.L1476: + movq -176(%rbp), %rbx + movl $32, %ecx + movl %ebx, %eax + subl $1, %eax + bsrl %eax, %eax + xorl $31, %eax + subl %eax, %ecx + movl $1, %eax + salq %cl, %rax + shrq $4, %rax + movq %rax, %rsi + movl $1, %eax + cmove %rax, %rsi + movq %rsi, %rdx + salq $4, %rdx + addq $4, %rdx + cmpq %rdx, %rbx + jnb .L1475 + movdqa .LC4(%rip), %xmm0 + movq %rbx, %rax +.L1474: + movups %xmm0, 0(%r13,%rax,4) + addq $4, %rax + cmpq %rdx, %rax + jb .L1474 +.L1475: + movq %r13, %rdi + call _ZN3hwy7N_SSSE36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + xorl %eax, %eax + cmpq $3, -176(%rbp) + jbe .L1478 + movq -176(%rbp), %r10 + movq 0(%r13), %rcx + movq -88(%rbp), %rbx + leaq -4(%r10), %rdx + movq %rdx, %rax + movq %rcx, (%rbx) + leaq 8(%rbx), %rdi + shrq $2, %rax + andq $-8, %rdi + addq $1, %rax + salq $4, %rax + movl %eax, %ecx + movq -8(%r13,%rcx), %rsi + movq %rsi, -8(%rbx,%rcx) + subq %rdi, %rbx + movq %r13, %rsi + movq %rbx, %rcx + subq %rbx, %rsi + addl %eax, %ecx + movq %rdx, %rax + andq $-4, %rax + shrl $3, %ecx + rep movsq + addq $4, %rax + subq %rax, %r10 + movq %r10, -176(%rbp) + je .L1467 +.L1478: + movq -176(%rbp), %rbx + movq -88(%rbp), %rdi + salq $2, %rax + movl $4, %ecx + leaq 0(%r13,%rax), %rsi + addq %rax, %rdi + leaq 0(,%rbx,4), %rdx + testq %rbx, %rbx + cmove %rcx, %rdx + call memcpy@PLT + jmp .L1467 +.L1767: + movdqa %xmm1, %xmm5 + pand %xmm4, %xmm1 + pandn %xmm2, %xmm5 + por %xmm5, %xmm1 + pcmpgtd %xmm0, %xmm1 + movmskps %xmm1, %eax + testl %eax, %eax + jne .L1752 + movdqa %xmm3, %xmm1 + movl $64, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 
+.L1518: + movq -88(%rbp), %rbx + leaq (%rcx,%rax,4), %rdx + addq $1, %rax + movdqu (%rbx,%rdx,4), %xmm2 + movdqa %xmm2, %xmm4 + pcmpgtd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pand %xmm4, %xmm2 + pandn %xmm1, %xmm5 + movdqa %xmm2, %xmm1 + por %xmm5, %xmm1 + cmpq $16, %rax + jne .L1518 + movdqa %xmm1, %xmm2 + pcmpgtd %xmm0, %xmm2 + movmskps %xmm2, %eax + testl %eax, %eax + jne .L1752 + leaq 64(%rsi), %rax + cmpq %rax, -176(%rbp) + jb .L1771 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L1518 +.L1770: + pxor %xmm5, %xmm1 + movq -88(%rbp), %rbx + movmskps %xmm1, %eax + rep bsfl %eax, %eax + cltq + movd (%rbx,%rax,4), %xmm6 + leaq 4(%r14), %rax + pshufd $0, %xmm6, %xmm1 + movdqa %xmm1, %xmm4 + movaps %xmm1, -64(%rbp) + cmpq %r15, %rax + ja .L1507 +.L1508: + movq -88(%rbp), %rbx + movdqa -80(%rbp), %xmm7 + movq %rax, %r14 + movups %xmm7, -16(%rbx,%rax,4) + addq $4, %rax + cmpq %r15, %rax + jbe .L1508 +.L1507: + subq %r14, %r15 + leaq 0(,%r14,4), %rdx + movq %r15, %xmm1 + pshufd $0, %xmm1, %xmm1 + pcmpgtd -208(%rbp), %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L1509 + movq -88(%rbp), %rax + movl %r12d, (%rax,%r14,4) +.L1509: + pshufd $85, %xmm1, %xmm5 + movd %xmm5, %eax + testl %eax, %eax + je .L1510 + movq -88(%rbp), %rax + pshufd $85, %xmm2, %xmm5 + movd %xmm5, 4(%rax,%rdx) +.L1510: + movdqa %xmm1, %xmm5 + punpckhdq %xmm1, %xmm5 + movd %xmm5, %eax + testl %eax, %eax + je .L1511 + movq -88(%rbp), %rax + movdqa %xmm2, %xmm5 + punpckhdq %xmm2, %xmm5 + movd %xmm5, 8(%rax,%rdx) +.L1511: + pshufd $255, %xmm1, %xmm1 + movd %xmm1, %eax + testl %eax, %eax + je .L1504 + movq -88(%rbp), %rax + pshufd $255, %xmm2, %xmm1 + movd %xmm1, 12(%rax,%rdx) + jmp .L1504 +.L1585: + xorl %ecx, %ecx + jmp .L1513 + .cfi_endproc +.LFE18806: + .size _ZN3hwy7N_SSSE36detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0, .-_ZN3hwy7N_SSSE36detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + .section .text._ZN3hwy6N_AVX26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_AVX26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0, @function +_ZN3hwy6N_AVX26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0: +.LFB18808: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsi, %rax + salq $2, %rax + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + .cfi_offset 15, -24 + leaq (%rdi,%rax), %r15 + pushq %r14 + .cfi_offset 14, -32 + leaq (%r15,%rax), %r14 + pushq %r13 + .cfi_offset 13, -40 + leaq (%r14,%rax), %r13 + pushq %r12 + .cfi_offset 12, -48 + leaq 0(%r13,%rax), %r12 + pushq %rbx + .cfi_offset 3, -56 + leaq (%r12,%rax), %rbx + leaq (%rbx,%rax), %r11 + leaq (%r11,%rax), %r10 + leaq (%r10,%rax), %r9 + andq $-32, %rsp + subq $200, %rsp + leaq (%r9,%rax), %r8 + movq %rdi, 176(%rsp) + leaq (%r8,%rax), %rdi + movq %rsi, 136(%rsp) + leaq (%rdi,%rax), %rsi + leaq (%rsi,%rax), %rcx + leaq (%rcx,%rax), %rdx + movq %rdx, 184(%rsp) + addq %rax, %rdx + addq %rdx, %rax + movq %rdx, 192(%rsp) + movq 176(%rsp), %rdx + vmovdqu (%rdx), %ymm5 + vpmaxsd (%r15), %ymm5, %ymm9 + vpminsd (%r15), %ymm5, %ymm15 + vmovdqu (%r14), %ymm5 + vpminsd 0(%r13), %ymm5, %ymm14 + vpmaxsd 0(%r13), %ymm5, %ymm8 + vmovdqu (%r12), %ymm5 + vpminsd 
(%rbx), %ymm5, %ymm13 + vpmaxsd (%rbx), %ymm5, %ymm1 + vmovdqu (%r11), %ymm5 + vpmaxsd (%r10), %ymm5, %ymm7 + vpminsd (%r10), %ymm5, %ymm3 + vmovdqu (%r9), %ymm5 + vpmaxsd (%r8), %ymm5, %ymm6 + vpminsd (%r8), %ymm5, %ymm12 + vmovdqu (%rdi), %ymm5 + vpminsd (%rsi), %ymm5, %ymm11 + vpmaxsd (%rsi), %ymm5, %ymm5 + movq 184(%rsp), %rdx + cmpq $1, 136(%rsp) + vmovdqu (%rdx), %ymm2 + movq 192(%rsp), %rdx + vmovdqa %ymm2, 104(%rsp) + vmovdqa 104(%rsp), %ymm2 + vpminsd (%rcx), %ymm2, %ymm4 + vpmaxsd (%rcx), %ymm2, %ymm0 + vmovdqu (%rdx), %ymm2 + vmovdqa %ymm4, 72(%rsp) + vmovdqa %ymm2, 104(%rsp) + vmovdqa 104(%rsp), %ymm10 + vpmaxsd (%rax), %ymm10, %ymm4 + vpminsd %ymm14, %ymm15, %ymm10 + vpmaxsd %ymm14, %ymm15, %ymm15 + vpminsd %ymm8, %ymm9, %ymm14 + vmovdqa 104(%rsp), %ymm2 + vpmaxsd %ymm8, %ymm9, %ymm8 + vpminsd (%rax), %ymm2, %ymm2 + vmovdqa %ymm8, 104(%rsp) + vpminsd %ymm7, %ymm1, %ymm9 + vpminsd %ymm3, %ymm13, %ymm8 + vpmaxsd %ymm7, %ymm1, %ymm1 + vpmaxsd %ymm3, %ymm13, %ymm3 + vpminsd %ymm11, %ymm12, %ymm7 + vpmaxsd %ymm5, %ymm6, %ymm13 + vpmaxsd %ymm11, %ymm12, %ymm12 + vpminsd %ymm5, %ymm6, %ymm11 + vmovdqa 72(%rsp), %ymm6 + vpminsd %ymm6, %ymm2, %ymm5 + vpmaxsd %ymm6, %ymm2, %ymm2 + vpminsd %ymm4, %ymm0, %ymm6 + vpmaxsd %ymm4, %ymm0, %ymm0 + vpminsd %ymm8, %ymm10, %ymm4 + vpmaxsd %ymm8, %ymm10, %ymm10 + vpminsd %ymm9, %ymm14, %ymm8 + vpmaxsd %ymm9, %ymm14, %ymm14 + vpminsd %ymm3, %ymm15, %ymm9 + vpmaxsd %ymm3, %ymm15, %ymm15 + vpminsd 104(%rsp), %ymm1, %ymm3 + vpmaxsd 104(%rsp), %ymm1, %ymm1 + vmovdqa %ymm1, 104(%rsp) + vpminsd %ymm5, %ymm7, %ymm1 + vpmaxsd %ymm5, %ymm7, %ymm5 + vpminsd %ymm6, %ymm11, %ymm7 + vpmaxsd %ymm6, %ymm11, %ymm6 + vpminsd %ymm2, %ymm12, %ymm11 + vpmaxsd %ymm2, %ymm12, %ymm12 + vpminsd %ymm0, %ymm13, %ymm2 + vpmaxsd %ymm0, %ymm13, %ymm0 + vpminsd %ymm1, %ymm4, %ymm13 + vpmaxsd %ymm1, %ymm4, %ymm1 + vpminsd %ymm12, %ymm15, %ymm4 + vpmaxsd %ymm12, %ymm15, %ymm12 + vmovdqa 104(%rsp), %ymm15 + vmovdqa %ymm13, 72(%rsp) + vpminsd %ymm7, %ymm8, %ymm13 + vpmaxsd %ymm7, %ymm8, %ymm7 + vpminsd %ymm11, %ymm9, %ymm8 + vpmaxsd %ymm11, %ymm9, %ymm11 + vpminsd %ymm2, %ymm3, %ymm9 + vpmaxsd %ymm2, %ymm3, %ymm2 + vpminsd %ymm5, %ymm10, %ymm3 + vpmaxsd %ymm5, %ymm10, %ymm5 + vpminsd %ymm6, %ymm14, %ymm10 + vpmaxsd %ymm6, %ymm14, %ymm6 + vpminsd %ymm15, %ymm0, %ymm14 + vpmaxsd %ymm15, %ymm0, %ymm0 + vmovdqa %ymm0, 104(%rsp) + vpminsd %ymm11, %ymm10, %ymm0 + vpmaxsd %ymm11, %ymm10, %ymm10 + vpminsd %ymm7, %ymm4, %ymm11 + vpmaxsd %ymm7, %ymm4, %ymm4 + vpminsd %ymm2, %ymm14, %ymm7 + vpmaxsd %ymm2, %ymm14, %ymm14 + vpminsd %ymm1, %ymm3, %ymm2 + vpmaxsd %ymm1, %ymm3, %ymm3 + vpminsd %ymm8, %ymm13, %ymm1 + vpminsd %ymm5, %ymm9, %ymm15 + vpmaxsd %ymm8, %ymm13, %ymm8 + vpmaxsd %ymm5, %ymm9, %ymm5 + vpminsd %ymm12, %ymm6, %ymm9 + vpmaxsd %ymm12, %ymm6, %ymm12 + vpminsd %ymm2, %ymm1, %ymm6 + vpmaxsd %ymm2, %ymm1, %ymm1 + vmovdqa %ymm6, 40(%rsp) + vpminsd %ymm3, %ymm8, %ymm6 + vpmaxsd %ymm3, %ymm8, %ymm8 + vpmaxsd %ymm12, %ymm14, %ymm3 + vpminsd %ymm9, %ymm7, %ymm2 + vmovdqa %ymm3, 8(%rsp) + vpminsd %ymm1, %ymm6, %ymm3 + vpmaxsd %ymm9, %ymm7, %ymm9 + vpminsd %ymm5, %ymm2, %ymm13 + vpminsd %ymm12, %ymm14, %ymm7 + vmovdqa %ymm3, -24(%rsp) + vpminsd %ymm11, %ymm0, %ymm12 + vpmaxsd %ymm11, %ymm0, %ymm14 + vpminsd %ymm10, %ymm4, %ymm3 + vpminsd %ymm8, %ymm15, %ymm11 + vpmaxsd %ymm5, %ymm2, %ymm5 + vpmaxsd %ymm1, %ymm6, %ymm6 + vpmaxsd %ymm10, %ymm4, %ymm4 + vpmaxsd %ymm8, %ymm15, %ymm1 + vpminsd %ymm12, %ymm11, %ymm2 + vpminsd %ymm5, %ymm4, %ymm8 + vpmaxsd %ymm12, %ymm11, %ymm11 + vpminsd %ymm3, 
%ymm13, %ymm15 + vpminsd %ymm1, %ymm14, %ymm12 + vpmaxsd %ymm3, %ymm13, %ymm13 + vpmaxsd %ymm1, %ymm14, %ymm1 + vpminsd %ymm9, %ymm7, %ymm10 + vpminsd %ymm1, %ymm15, %ymm3 + vpmaxsd %ymm5, %ymm4, %ymm4 + vpmaxsd %ymm1, %ymm15, %ymm15 + vpminsd %ymm8, %ymm13, %ymm0 + vpminsd %ymm12, %ymm11, %ymm5 + vpmaxsd %ymm12, %ymm11, %ymm11 + vpminsd %ymm4, %ymm10, %ymm1 + vpmaxsd %ymm9, %ymm7, %ymm7 + vpmaxsd %ymm4, %ymm10, %ymm4 + vpminsd %ymm6, %ymm2, %ymm9 + vpminsd %ymm3, %ymm11, %ymm10 + vpminsd %ymm0, %ymm15, %ymm14 + vpmaxsd %ymm6, %ymm2, %ymm2 + vpmaxsd %ymm8, %ymm13, %ymm13 + vpmaxsd %ymm3, %ymm11, %ymm11 + vpmaxsd %ymm0, %ymm15, %ymm15 + jbe .L1778 + vmovdqa 72(%rsp), %ymm3 + vpshufd $177, %ymm14, %ymm14 + vpshufd $177, %ymm15, %ymm15 + vpshufd $177, 104(%rsp), %ymm6 + vpshufd $177, %ymm13, %ymm13 + vpshufd $177, %ymm1, %ymm1 + vpshufd $177, %ymm4, %ymm4 + vpshufd $177, 8(%rsp), %ymm8 + vpminsd %ymm3, %ymm6, %ymm12 + vpmaxsd %ymm3, %ymm6, %ymm6 + vmovdqa 40(%rsp), %ymm3 + vpshufd $177, %ymm7, %ymm7 + cmpq $3, 136(%rsp) + vmovdqa %ymm6, 104(%rsp) + vpminsd %ymm3, %ymm8, %ymm0 + vpmaxsd %ymm3, %ymm8, %ymm8 + vmovdqa -24(%rsp), %ymm3 + vpshufd $177, %ymm8, %ymm8 + vpminsd %ymm3, %ymm7, %ymm6 + vpmaxsd %ymm3, %ymm7, %ymm7 + vpminsd %ymm4, %ymm9, %ymm3 + vpmaxsd %ymm4, %ymm9, %ymm9 + vmovdqa %ymm6, 72(%rsp) + vpshufd $177, %ymm7, %ymm7 + vpminsd %ymm1, %ymm2, %ymm4 + vpmaxsd %ymm1, %ymm2, %ymm2 + vmovdqa %ymm3, 40(%rsp) + vmovdqa 72(%rsp), %ymm3 + vpminsd %ymm13, %ymm5, %ymm1 + vpmaxsd %ymm13, %ymm5, %ymm5 + vpshufd $177, %ymm4, %ymm4 + vpshufd $177, 104(%rsp), %ymm6 + vpminsd %ymm15, %ymm10, %ymm13 + vpmaxsd %ymm15, %ymm10, %ymm10 + vpshufd $177, %ymm1, %ymm1 + vpminsd %ymm14, %ymm11, %ymm15 + vpmaxsd %ymm14, %ymm11, %ymm11 + vpshufd $177, %ymm13, %ymm13 + vpshufd $177, %ymm15, %ymm15 + vpshufd $177, %ymm9, %ymm9 + vpminsd %ymm15, %ymm12, %ymm14 + vpmaxsd %ymm15, %ymm12, %ymm12 + vpminsd %ymm6, %ymm11, %ymm15 + vpmaxsd %ymm6, %ymm11, %ymm11 + vpshufd $177, %ymm12, %ymm12 + vmovdqa %ymm15, 104(%rsp) + vpminsd %ymm8, %ymm10, %ymm15 + vpmaxsd %ymm8, %ymm10, %ymm10 + vpshufd $177, %ymm11, %ymm11 + vpminsd %ymm3, %ymm1, %ymm8 + vpmaxsd %ymm3, %ymm1, %ymm3 + vpshufd $177, %ymm10, %ymm10 + vpminsd %ymm7, %ymm5, %ymm1 + vpmaxsd %ymm7, %ymm5, %ymm5 + vmovdqa 40(%rsp), %ymm7 + vpshufd $177, %ymm8, %ymm8 + vpminsd %ymm13, %ymm0, %ymm6 + vpmaxsd %ymm13, %ymm0, %ymm0 + vpshufd $177, %ymm1, %ymm1 + vpminsd %ymm7, %ymm4, %ymm13 + vpmaxsd %ymm7, %ymm4, %ymm4 + vpshufd $177, %ymm0, %ymm0 + vpshufd $177, %ymm13, %ymm13 + vpminsd %ymm9, %ymm2, %ymm7 + vpmaxsd %ymm9, %ymm2, %ymm2 + vpminsd %ymm13, %ymm14, %ymm9 + vpmaxsd %ymm13, %ymm14, %ymm14 + vpshufd $177, %ymm7, %ymm7 + vpminsd %ymm8, %ymm6, %ymm13 + vpmaxsd %ymm8, %ymm6, %ymm6 + vpshufd $177, %ymm14, %ymm14 + vpminsd %ymm12, %ymm4, %ymm8 + vpmaxsd %ymm12, %ymm4, %ymm4 + vmovdqa 104(%rsp), %ymm12 + vmovdqa %ymm13, 72(%rsp) + vpminsd %ymm0, %ymm3, %ymm13 + vpmaxsd %ymm0, %ymm3, %ymm3 + vpshufd $177, %ymm4, %ymm4 + vpminsd %ymm12, %ymm7, %ymm0 + vpmaxsd %ymm12, %ymm7, %ymm7 + vpshufd $177, %ymm13, %ymm13 + vpminsd %ymm1, %ymm15, %ymm12 + vpmaxsd %ymm1, %ymm15, %ymm1 + vpshufd $177, %ymm7, %ymm7 + vpminsd %ymm11, %ymm2, %ymm15 + vpshufd $177, %ymm12, %ymm12 + vpmaxsd %ymm11, %ymm2, %ymm2 + vmovdqa %ymm15, 104(%rsp) + vpminsd %ymm10, %ymm5, %ymm11 + vpmaxsd %ymm10, %ymm5, %ymm5 + vpshufd $177, 72(%rsp), %ymm15 + vpminsd %ymm15, %ymm9, %ymm10 + vpmaxsd %ymm15, %ymm9, %ymm9 + vpshufd $177, %ymm2, %ymm2 + vpminsd %ymm14, %ymm6, %ymm15 + vpmaxsd %ymm14, %ymm6, %ymm6 + 
vpshufd $177, %ymm11, %ymm11 + vpminsd %ymm13, %ymm8, %ymm14 + vpmaxsd %ymm13, %ymm8, %ymm8 + vpminsd %ymm4, %ymm3, %ymm13 + vpmaxsd %ymm4, %ymm3, %ymm3 + vpminsd %ymm12, %ymm0, %ymm4 + vpmaxsd %ymm12, %ymm0, %ymm0 + vmovdqa %ymm4, 72(%rsp) + vmovdqa 104(%rsp), %ymm4 + vpminsd %ymm7, %ymm1, %ymm12 + vpmaxsd %ymm7, %ymm1, %ymm1 + vpminsd %ymm4, %ymm11, %ymm7 + vpmaxsd %ymm4, %ymm11, %ymm11 + vpminsd %ymm2, %ymm5, %ymm4 + vpmaxsd %ymm2, %ymm5, %ymm5 + vpshufd $177, %ymm10, %ymm2 + vmovdqa %ymm5, 40(%rsp) + vpminsd %ymm2, %ymm10, %ymm5 + vpmaxsd %ymm2, %ymm10, %ymm10 + vpshufd $177, %ymm9, %ymm2 + vpblendd $85, %ymm5, %ymm10, %ymm10 + vpminsd %ymm2, %ymm9, %ymm5 + vpmaxsd %ymm2, %ymm9, %ymm9 + vmovdqa %ymm4, 104(%rsp) + vpblendd $85, %ymm5, %ymm9, %ymm2 + vmovdqa %ymm2, 8(%rsp) + vpshufd $177, %ymm15, %ymm2 + vpminsd %ymm2, %ymm15, %ymm5 + vpmaxsd %ymm2, %ymm15, %ymm15 + vpshufd $177, %ymm6, %ymm2 + vpblendd $85, %ymm5, %ymm15, %ymm15 + vpminsd %ymm2, %ymm6, %ymm5 + vpmaxsd %ymm2, %ymm6, %ymm6 + vpshufd $177, %ymm14, %ymm2 + vpblendd $85, %ymm5, %ymm6, %ymm6 + vpminsd %ymm2, %ymm14, %ymm5 + vpmaxsd %ymm2, %ymm14, %ymm14 + vpshufd $177, %ymm8, %ymm2 + vmovdqa %ymm6, -24(%rsp) + vpblendd $85, %ymm5, %ymm14, %ymm14 + vpminsd %ymm2, %ymm8, %ymm5 + vpmaxsd %ymm2, %ymm8, %ymm8 + vpshufd $177, %ymm13, %ymm2 + vpblendd $85, %ymm5, %ymm8, %ymm6 + vpshufd $177, %ymm7, %ymm8 + vpminsd %ymm2, %ymm13, %ymm5 + vpmaxsd %ymm2, %ymm13, %ymm13 + vpshufd $177, %ymm3, %ymm2 + vmovdqa %ymm6, -56(%rsp) + vpblendd $85, %ymm5, %ymm13, %ymm6 + vpminsd %ymm2, %ymm3, %ymm5 + vpmaxsd %ymm2, %ymm3, %ymm3 + vpblendd $85, %ymm5, %ymm3, %ymm13 + vmovdqa 72(%rsp), %ymm5 + vmovdqa %ymm6, -88(%rsp) + vpshufd $177, %ymm5, %ymm6 + vpminsd %ymm5, %ymm6, %ymm2 + vpmaxsd %ymm5, %ymm6, %ymm6 + vpblendd $85, %ymm2, %ymm6, %ymm6 + vpshufd $177, %ymm0, %ymm2 + vpminsd %ymm2, %ymm0, %ymm3 + vpmaxsd %ymm2, %ymm0, %ymm0 + vpblendd $85, %ymm3, %ymm0, %ymm2 + vpshufd $177, %ymm12, %ymm0 + vpminsd %ymm0, %ymm12, %ymm3 + vpmaxsd %ymm0, %ymm12, %ymm0 + vpblendd $85, %ymm3, %ymm0, %ymm0 + vpshufd $177, %ymm1, %ymm3 + vpminsd %ymm3, %ymm1, %ymm4 + vpmaxsd %ymm3, %ymm1, %ymm1 + vpblendd $85, %ymm4, %ymm1, %ymm1 + vmovdqa 104(%rsp), %ymm4 + vpminsd %ymm8, %ymm7, %ymm3 + vpmaxsd %ymm8, %ymm7, %ymm8 + vpshufd $177, %ymm11, %ymm7 + vpblendd $85, %ymm3, %ymm8, %ymm8 + vpshufd $177, %ymm4, %ymm9 + vpminsd %ymm7, %ymm11, %ymm3 + vpmaxsd %ymm7, %ymm11, %ymm7 + vpblendd $85, %ymm3, %ymm7, %ymm7 + vpminsd %ymm4, %ymm9, %ymm3 + vpmaxsd %ymm4, %ymm9, %ymm9 + vmovdqa 40(%rsp), %ymm4 + vpblendd $85, %ymm3, %ymm9, %ymm9 + vpshufd $177, %ymm4, %ymm5 + vpminsd %ymm4, %ymm5, %ymm3 + vpmaxsd %ymm4, %ymm5, %ymm5 + vpblendd $85, %ymm3, %ymm5, %ymm5 + jbe .L1779 + vmovdqa 8(%rsp), %ymm3 + vmovdqa -24(%rsp), %ymm4 + vpshufd $27, %ymm1, %ymm1 + vpshufd $27, %ymm8, %ymm8 + vpshufd $27, %ymm7, %ymm7 + vpshufd $27, %ymm9, %ymm9 + vpshufd $27, %ymm5, %ymm5 + cmpq $7, 136(%rsp) + vpminsd %ymm5, %ymm10, %ymm12 + vpshufd $27, %ymm2, %ymm11 + vpmaxsd %ymm5, %ymm10, %ymm5 + vpminsd %ymm7, %ymm15, %ymm2 + vpminsd %ymm3, %ymm9, %ymm10 + vpshufd $27, %ymm6, %ymm6 + vpmaxsd %ymm3, %ymm9, %ymm9 + vpmaxsd %ymm7, %ymm15, %ymm7 + vmovdqa -88(%rsp), %ymm15 + vpshufd $27, %ymm0, %ymm0 + vpminsd %ymm4, %ymm8, %ymm3 + vpmaxsd %ymm4, %ymm8, %ymm8 + vpshufd $27, %ymm5, %ymm5 + vpminsd %ymm1, %ymm14, %ymm4 + vpmaxsd %ymm1, %ymm14, %ymm1 + vmovdqa -56(%rsp), %ymm14 + vmovdqa %ymm3, 104(%rsp) + vpshufd $27, %ymm7, %ymm7 + vpshufd $27, %ymm9, %ymm9 + vpshufd $27, %ymm4, %ymm4 + vpminsd %ymm14, 
%ymm0, %ymm3 + vpmaxsd %ymm14, %ymm0, %ymm0 + vpshufd $27, %ymm8, %ymm8 + vpminsd %ymm15, %ymm11, %ymm14 + vpmaxsd %ymm15, %ymm11, %ymm11 + vpshufd $27, %ymm3, %ymm3 + vpminsd %ymm6, %ymm13, %ymm15 + vpmaxsd %ymm6, %ymm13, %ymm6 + vpshufd $27, %ymm14, %ymm14 + vpshufd $27, %ymm15, %ymm15 + vpminsd %ymm15, %ymm12, %ymm13 + vpmaxsd %ymm15, %ymm12, %ymm12 + vpminsd %ymm5, %ymm6, %ymm15 + vpmaxsd %ymm5, %ymm6, %ymm6 + vpshufd $27, %ymm12, %ymm12 + vmovdqa %ymm15, 72(%rsp) + vpminsd %ymm9, %ymm11, %ymm15 + vpmaxsd %ymm9, %ymm11, %ymm11 + vpshufd $27, %ymm6, %ymm6 + vpminsd %ymm3, %ymm2, %ymm9 + vpmaxsd %ymm3, %ymm2, %ymm2 + vpshufd $27, %ymm11, %ymm11 + vpminsd %ymm7, %ymm0, %ymm3 + vpmaxsd %ymm7, %ymm0, %ymm0 + vmovdqa 104(%rsp), %ymm7 + vpshufd $27, %ymm9, %ymm9 + vpminsd %ymm14, %ymm10, %ymm5 + vpmaxsd %ymm14, %ymm10, %ymm10 + vpshufd $27, %ymm3, %ymm3 + vpminsd %ymm7, %ymm4, %ymm14 + vpmaxsd %ymm7, %ymm4, %ymm4 + vpshufd $27, %ymm14, %ymm14 + vpminsd %ymm8, %ymm1, %ymm7 + vpmaxsd %ymm8, %ymm1, %ymm1 + vpshufd $27, %ymm10, %ymm8 + vpminsd %ymm14, %ymm13, %ymm10 + vpmaxsd %ymm14, %ymm13, %ymm14 + vpminsd %ymm9, %ymm5, %ymm13 + vpmaxsd %ymm9, %ymm5, %ymm5 + vpshufd $27, %ymm7, %ymm7 + vpminsd %ymm12, %ymm4, %ymm9 + vpmaxsd %ymm12, %ymm4, %ymm4 + vmovdqa 72(%rsp), %ymm12 + vmovdqa %ymm13, 104(%rsp) + vpminsd %ymm8, %ymm2, %ymm13 + vpshufd $27, %ymm14, %ymm14 + vpmaxsd %ymm8, %ymm2, %ymm2 + vpshufd $27, %ymm13, %ymm13 + vpminsd %ymm12, %ymm7, %ymm8 + vpshufd $27, %ymm4, %ymm4 + vpmaxsd %ymm12, %ymm7, %ymm7 + vpminsd %ymm3, %ymm15, %ymm12 + vpmaxsd %ymm3, %ymm15, %ymm3 + vpminsd %ymm6, %ymm1, %ymm15 + vpshufd $27, %ymm12, %ymm12 + vmovdqa %ymm15, 72(%rsp) + vpmaxsd %ymm6, %ymm1, %ymm1 + vpminsd %ymm11, %ymm0, %ymm6 + vpshufd $27, 104(%rsp), %ymm15 + vpmaxsd %ymm11, %ymm0, %ymm0 + vpminsd %ymm15, %ymm10, %ymm11 + vpshufd $27, %ymm7, %ymm7 + vpmaxsd %ymm15, %ymm10, %ymm10 + vpminsd %ymm14, %ymm5, %ymm15 + vpshufd $27, %ymm1, %ymm1 + vpmaxsd %ymm14, %ymm5, %ymm5 + vpminsd %ymm13, %ymm9, %ymm14 + vpshufd $27, %ymm6, %ymm6 + vpmaxsd %ymm13, %ymm9, %ymm9 + vpminsd %ymm4, %ymm2, %ymm13 + vpmaxsd %ymm4, %ymm2, %ymm2 + vpminsd %ymm12, %ymm8, %ymm4 + vmovdqa %ymm2, 40(%rsp) + vmovdqa 72(%rsp), %ymm2 + vpmaxsd %ymm12, %ymm8, %ymm8 + vpminsd %ymm7, %ymm3, %ymm12 + vpmaxsd %ymm7, %ymm3, %ymm3 + vpminsd %ymm2, %ymm6, %ymm7 + vpmaxsd %ymm2, %ymm6, %ymm6 + vpminsd %ymm1, %ymm0, %ymm2 + vpmaxsd %ymm1, %ymm0, %ymm0 + vmovdqa %ymm0, 72(%rsp) + vpshufd $27, %ymm11, %ymm0 + vpminsd %ymm0, %ymm11, %ymm1 + vpmaxsd %ymm0, %ymm11, %ymm11 + vpshufd $27, %ymm10, %ymm0 + vmovdqa %ymm2, 104(%rsp) + vpblendd $51, %ymm1, %ymm11, %ymm11 + vpminsd %ymm0, %ymm10, %ymm1 + vpmaxsd %ymm0, %ymm10, %ymm10 + vmovdqa 40(%rsp), %ymm2 + vpshufd $27, %ymm15, %ymm0 + vpblendd $51, %ymm1, %ymm10, %ymm10 + vpminsd %ymm0, %ymm15, %ymm1 + vpmaxsd %ymm0, %ymm15, %ymm15 + vpshufd $27, %ymm5, %ymm0 + vpblendd $51, %ymm1, %ymm15, %ymm15 + vpminsd %ymm0, %ymm5, %ymm1 + vpmaxsd %ymm0, %ymm5, %ymm5 + vpshufd $27, %ymm14, %ymm0 + vpblendd $51, %ymm1, %ymm5, %ymm5 + vpminsd %ymm0, %ymm14, %ymm1 + vpmaxsd %ymm0, %ymm14, %ymm14 + vpshufd $27, %ymm9, %ymm0 + vpblendd $51, %ymm1, %ymm14, %ymm14 + vpminsd %ymm0, %ymm9, %ymm1 + vpmaxsd %ymm0, %ymm9, %ymm9 + vpshufd $27, %ymm13, %ymm0 + vpblendd $51, %ymm1, %ymm9, %ymm9 + vpminsd %ymm0, %ymm13, %ymm1 + vpmaxsd %ymm0, %ymm13, %ymm13 + vpshufd $27, %ymm2, %ymm0 + vpblendd $51, %ymm1, %ymm13, %ymm13 + vpminsd %ymm2, %ymm0, %ymm1 + vpmaxsd %ymm2, %ymm0, %ymm2 + vpshufd $27, %ymm4, %ymm0 + vpblendd $51, %ymm1, 
%ymm2, %ymm2 + vpminsd %ymm0, %ymm4, %ymm1 + vpmaxsd %ymm0, %ymm4, %ymm4 + vpshufd $27, %ymm8, %ymm0 + vpblendd $51, %ymm1, %ymm4, %ymm4 + vpminsd %ymm0, %ymm8, %ymm1 + vpmaxsd %ymm0, %ymm8, %ymm8 + vpshufd $27, %ymm12, %ymm0 + vpblendd $51, %ymm1, %ymm8, %ymm8 + vpminsd %ymm0, %ymm12, %ymm1 + vpmaxsd %ymm0, %ymm12, %ymm12 + vpshufd $27, %ymm3, %ymm0 + vpblendd $51, %ymm1, %ymm12, %ymm12 + vpminsd %ymm0, %ymm3, %ymm1 + vpmaxsd %ymm0, %ymm3, %ymm3 + vpshufd $27, %ymm7, %ymm0 + vpblendd $51, %ymm1, %ymm3, %ymm3 + vpminsd %ymm0, %ymm7, %ymm1 + vpmaxsd %ymm0, %ymm7, %ymm7 + vpshufd $27, %ymm6, %ymm0 + vpblendd $51, %ymm1, %ymm7, %ymm7 + vpminsd %ymm0, %ymm6, %ymm1 + vpmaxsd %ymm0, %ymm6, %ymm6 + vpblendd $51, %ymm1, %ymm6, %ymm6 + vmovdqa 104(%rsp), %ymm1 + vpshufd $27, %ymm1, %ymm0 + vpminsd %ymm1, %ymm0, %ymm1 + vpmaxsd 104(%rsp), %ymm0, %ymm0 + vpblendd $51, %ymm1, %ymm0, %ymm1 + vmovdqa %ymm1, 104(%rsp) + vmovdqa 72(%rsp), %ymm1 + vpshufd $27, %ymm1, %ymm0 + vpminsd %ymm1, %ymm0, %ymm1 + vpmaxsd 72(%rsp), %ymm0, %ymm0 + vpblendd $51, %ymm1, %ymm0, %ymm0 + vmovdqa %ymm0, 72(%rsp) + vpshufd $177, %ymm11, %ymm0 + vpminsd %ymm0, %ymm11, %ymm1 + vpmaxsd %ymm0, %ymm11, %ymm11 + vpshufd $177, %ymm10, %ymm0 + vpblendd $85, %ymm1, %ymm11, %ymm11 + vpminsd %ymm0, %ymm10, %ymm1 + vpmaxsd %ymm0, %ymm10, %ymm10 + vpshufd $177, %ymm15, %ymm0 + vpblendd $85, %ymm1, %ymm10, %ymm10 + vmovdqa %ymm11, 40(%rsp) + vpminsd %ymm0, %ymm15, %ymm1 + vpmaxsd %ymm0, %ymm15, %ymm15 + vpshufd $177, %ymm5, %ymm0 + vpblendd $85, %ymm1, %ymm15, %ymm11 + vpminsd %ymm0, %ymm5, %ymm1 + vpmaxsd %ymm0, %ymm5, %ymm5 + vpshufd $177, %ymm14, %ymm0 + vpblendd $85, %ymm1, %ymm5, %ymm5 + vmovdqa %ymm11, 8(%rsp) + vpshufd $177, %ymm8, %ymm15 + vpminsd %ymm0, %ymm14, %ymm1 + vpmaxsd %ymm0, %ymm14, %ymm14 + vpshufd $177, %ymm9, %ymm0 + vpblendd $85, %ymm1, %ymm14, %ymm11 + vpminsd %ymm0, %ymm9, %ymm1 + vpmaxsd %ymm0, %ymm9, %ymm9 + vpshufd $177, %ymm13, %ymm0 + vmovdqa %ymm11, -24(%rsp) + vpshufd $177, %ymm4, %ymm14 + vpblendd $85, %ymm1, %ymm9, %ymm11 + vpminsd %ymm0, %ymm13, %ymm1 + vpmaxsd %ymm0, %ymm13, %ymm13 + vpshufd $177, %ymm2, %ymm0 + vmovdqa %ymm11, -56(%rsp) + vpblendd $85, %ymm1, %ymm13, %ymm11 + vpminsd %ymm0, %ymm2, %ymm1 + vpmaxsd %ymm0, %ymm2, %ymm2 + vpshufd $177, %ymm12, %ymm13 + vmovdqa %ymm11, -88(%rsp) + vpminsd %ymm14, %ymm4, %ymm0 + vpblendd $85, %ymm1, %ymm2, %ymm11 + vpminsd %ymm15, %ymm8, %ymm1 + vpmaxsd %ymm15, %ymm8, %ymm15 + vpblendd $85, %ymm1, %ymm15, %ymm15 + vpminsd %ymm13, %ymm12, %ymm1 + vpmaxsd %ymm13, %ymm12, %ymm13 + vpmaxsd %ymm14, %ymm4, %ymm14 + vpblendd $85, %ymm1, %ymm13, %ymm13 + vpshufd $177, %ymm3, %ymm1 + vpshufd $177, %ymm7, %ymm4 + vpblendd $85, %ymm0, %ymm14, %ymm14 + vpminsd %ymm1, %ymm3, %ymm0 + vpmaxsd %ymm1, %ymm3, %ymm1 + vmovdqa 72(%rsp), %ymm3 + vpblendd $85, %ymm0, %ymm1, %ymm1 + vpminsd %ymm4, %ymm7, %ymm0 + vpmaxsd %ymm4, %ymm7, %ymm4 + vpshufd $177, %ymm6, %ymm7 + vpblendd $85, %ymm0, %ymm4, %ymm4 + vpshufd $177, %ymm3, %ymm12 + vpminsd %ymm7, %ymm6, %ymm0 + vpmaxsd %ymm7, %ymm6, %ymm7 + vmovdqa 104(%rsp), %ymm6 + vpblendd $85, %ymm0, %ymm7, %ymm7 + vpshufd $177, %ymm6, %ymm9 + vpminsd %ymm6, %ymm9, %ymm0 + vpmaxsd %ymm6, %ymm9, %ymm9 + vpblendd $85, %ymm0, %ymm9, %ymm9 + vpminsd %ymm3, %ymm12, %ymm0 + vpmaxsd %ymm3, %ymm12, %ymm12 + vpblendd $85, %ymm0, %ymm12, %ymm12 + jbe .L1780 + vmovdqa .LC13(%rip), %ymm2 + vmovdqa 40(%rsp), %ymm3 + vpermd %ymm12, %ymm2, %ymm12 + vpermd %ymm15, %ymm2, %ymm0 + vpermd %ymm13, %ymm2, %ymm6 + vpmaxsd %ymm3, %ymm12, %ymm15 + vpminsd %ymm3, 
%ymm12, %ymm13 + vpermd %ymm14, %ymm2, %ymm14 + vmovdqa 8(%rsp), %ymm3 + vmovdqa %ymm14, 136(%rsp) + vpermd %ymm7, %ymm2, %ymm7 + vpermd %ymm1, %ymm2, %ymm14 + vpermd %ymm4, %ymm2, %ymm1 + vmovdqa -24(%rsp), %ymm4 + vpminsd %ymm3, %ymm7, %ymm8 + vpmaxsd %ymm3, %ymm7, %ymm7 + vmovdqa %ymm6, 104(%rsp) + vpminsd %ymm1, %ymm5, %ymm3 + vpermd %ymm9, %ymm2, %ymm6 + vpmaxsd %ymm1, %ymm5, %ymm1 + vmovdqa 104(%rsp), %ymm5 + vmovdqa %ymm3, 72(%rsp) + vpminsd %ymm4, %ymm14, %ymm3 + vpermd %ymm7, %ymm2, %ymm7 + vpermd %ymm1, %ymm2, %ymm1 + vpmaxsd %ymm4, %ymm14, %ymm4 + vpminsd %ymm6, %ymm10, %ymm12 + vpermd %ymm3, %ymm2, %ymm3 + vmovdqa 136(%rsp), %ymm14 + vmovdqa %ymm4, 40(%rsp) + vpmaxsd %ymm6, %ymm10, %ymm6 + vmovdqa -88(%rsp), %ymm4 + vmovdqa -56(%rsp), %ymm10 + vpermd %ymm6, %ymm2, %ymm6 + vpminsd %ymm10, %ymm5, %ymm9 + vpmaxsd %ymm5, %ymm10, %ymm5 + vpminsd %ymm4, %ymm0, %ymm10 + vpmaxsd %ymm4, %ymm0, %ymm0 + vpermd %ymm9, %ymm2, %ymm9 + vpminsd %ymm14, %ymm11, %ymm4 + vpmaxsd %ymm14, %ymm11, %ymm11 + vpermd %ymm15, %ymm2, %ymm14 + vpermd %ymm4, %ymm2, %ymm4 + vpermd %ymm10, %ymm2, %ymm10 + vpminsd %ymm4, %ymm13, %ymm15 + vpmaxsd %ymm4, %ymm13, %ymm13 + vpminsd %ymm14, %ymm11, %ymm4 + vpmaxsd %ymm14, %ymm11, %ymm11 + vpermd %ymm13, %ymm2, %ymm13 + vmovdqa %ymm4, 136(%rsp) + vpmaxsd %ymm7, %ymm5, %ymm14 + vpminsd %ymm10, %ymm12, %ymm4 + vpermd %ymm11, %ymm2, %ymm11 + vpmaxsd %ymm10, %ymm12, %ymm12 + vpminsd %ymm6, %ymm0, %ymm10 + vpmaxsd %ymm6, %ymm0, %ymm0 + vpminsd %ymm9, %ymm8, %ymm6 + vmovdqa %ymm10, 104(%rsp) + vpermd %ymm12, %ymm2, %ymm12 + vpmaxsd %ymm9, %ymm8, %ymm8 + vpminsd %ymm7, %ymm5, %ymm9 + vmovdqa 72(%rsp), %ymm7 + vpermd %ymm6, %ymm2, %ymm6 + vpermd %ymm9, %ymm2, %ymm9 + vpermd %ymm0, %ymm2, %ymm0 + vpminsd %ymm7, %ymm3, %ymm10 + vpmaxsd %ymm7, %ymm3, %ymm3 + vmovdqa 40(%rsp), %ymm7 + vpermd %ymm10, %ymm2, %ymm10 + vpminsd %ymm7, %ymm1, %ymm5 + vpmaxsd %ymm7, %ymm1, %ymm1 + vpminsd %ymm10, %ymm15, %ymm7 + vpmaxsd %ymm10, %ymm15, %ymm15 + vpermd %ymm5, %ymm2, %ymm5 + vpminsd %ymm6, %ymm4, %ymm10 + vpmaxsd %ymm6, %ymm4, %ymm4 + vpermd %ymm15, %ymm2, %ymm15 + vpminsd %ymm13, %ymm3, %ymm6 + vpmaxsd %ymm13, %ymm3, %ymm3 + vpermd %ymm10, %ymm2, %ymm10 + vpminsd %ymm12, %ymm8, %ymm13 + vpmaxsd %ymm12, %ymm8, %ymm8 + vpermd %ymm3, %ymm2, %ymm3 + vmovdqa %ymm13, 72(%rsp) + vmovdqa 136(%rsp), %ymm13 + vpminsd %ymm13, %ymm5, %ymm12 + vpmaxsd %ymm13, %ymm5, %ymm5 + vmovdqa %ymm12, 40(%rsp) + vmovdqa 104(%rsp), %ymm12 + vpermd %ymm5, %ymm2, %ymm5 + vpminsd %ymm12, %ymm9, %ymm13 + vpmaxsd %ymm12, %ymm9, %ymm9 + vpminsd %ymm11, %ymm1, %ymm12 + vpermd %ymm13, %ymm2, %ymm13 + vpmaxsd %ymm11, %ymm1, %ymm1 + vmovdqa %ymm12, 104(%rsp) + vpminsd %ymm0, %ymm14, %ymm12 + vpmaxsd %ymm0, %ymm14, %ymm0 + vpermd 72(%rsp), %ymm2, %ymm14 + vpminsd %ymm10, %ymm7, %ymm11 + vpmaxsd %ymm10, %ymm7, %ymm7 + vpermd %ymm12, %ymm2, %ymm12 + vmovdqa %ymm0, 136(%rsp) + vmovdqa 40(%rsp), %ymm0 + vpminsd %ymm15, %ymm4, %ymm10 + vpmaxsd %ymm15, %ymm4, %ymm4 + vpermd %ymm1, %ymm2, %ymm1 + vpminsd %ymm14, %ymm6, %ymm15 + vpmaxsd %ymm14, %ymm6, %ymm6 + vpminsd %ymm3, %ymm8, %ymm14 + vpmaxsd %ymm3, %ymm8, %ymm3 + vpminsd %ymm0, %ymm13, %ymm8 + vpmaxsd %ymm0, %ymm13, %ymm13 + vpminsd %ymm5, %ymm9, %ymm0 + vpmaxsd %ymm5, %ymm9, %ymm5 + vmovdqa %ymm0, 72(%rsp) + vmovdqa 104(%rsp), %ymm0 + vpminsd %ymm0, %ymm12, %ymm9 + vpmaxsd %ymm0, %ymm12, %ymm12 + vpminsd 136(%rsp), %ymm1, %ymm0 + vpmaxsd 136(%rsp), %ymm1, %ymm1 + vmovdqa %ymm0, 104(%rsp) + vpermd %ymm11, %ymm2, %ymm0 + vmovdqa %ymm1, 40(%rsp) + vpminsd %ymm0, %ymm11, %ymm1 
+ vpmaxsd %ymm0, %ymm11, %ymm11 + vpermd %ymm7, %ymm2, %ymm0 + vpblendd $15, %ymm1, %ymm11, %ymm11 + vpminsd %ymm0, %ymm7, %ymm1 + vpmaxsd %ymm0, %ymm7, %ymm7 + vpermd %ymm10, %ymm2, %ymm0 + vpblendd $15, %ymm1, %ymm7, %ymm7 + vpminsd %ymm0, %ymm10, %ymm1 + vpmaxsd %ymm0, %ymm10, %ymm10 + vpermd %ymm4, %ymm2, %ymm0 + vpblendd $15, %ymm1, %ymm10, %ymm10 + vpminsd %ymm0, %ymm4, %ymm1 + vpmaxsd %ymm0, %ymm4, %ymm4 + vpermd %ymm15, %ymm2, %ymm0 + vpblendd $15, %ymm1, %ymm4, %ymm4 + vmovdqa %ymm10, 8(%rsp) + vpminsd %ymm0, %ymm15, %ymm1 + vpmaxsd %ymm0, %ymm15, %ymm15 + vpermd %ymm6, %ymm2, %ymm0 + vpblendd $15, %ymm1, %ymm15, %ymm15 + vpminsd %ymm0, %ymm6, %ymm1 + vpmaxsd %ymm0, %ymm6, %ymm6 + vpermd %ymm14, %ymm2, %ymm0 + vpblendd $15, %ymm1, %ymm6, %ymm10 + vpminsd %ymm0, %ymm14, %ymm1 + vpmaxsd %ymm0, %ymm14, %ymm14 + vpermd %ymm3, %ymm2, %ymm0 + vpblendd $15, %ymm1, %ymm14, %ymm14 + vpminsd %ymm0, %ymm3, %ymm1 + vpmaxsd %ymm0, %ymm3, %ymm3 + vpermd %ymm8, %ymm2, %ymm0 + vmovdqa %ymm14, -24(%rsp) + vpblendd $15, %ymm1, %ymm3, %ymm14 + vpminsd %ymm0, %ymm8, %ymm1 + vpmaxsd %ymm0, %ymm8, %ymm8 + vpermd %ymm13, %ymm2, %ymm0 + vpblendd $15, %ymm1, %ymm8, %ymm8 + vpminsd %ymm0, %ymm13, %ymm1 + vpmaxsd %ymm0, %ymm13, %ymm13 + vmovdqa %ymm8, -56(%rsp) + vmovdqa 72(%rsp), %ymm3 + vpblendd $15, %ymm1, %ymm13, %ymm13 + vmovdqa 40(%rsp), %ymm6 + vpermd %ymm3, %ymm2, %ymm0 + vpminsd %ymm3, %ymm0, %ymm1 + vpmaxsd %ymm3, %ymm0, %ymm0 + vpblendd $15, %ymm1, %ymm0, %ymm3 + vpermd %ymm5, %ymm2, %ymm0 + vpminsd %ymm0, %ymm5, %ymm1 + vpmaxsd %ymm0, %ymm5, %ymm5 + vpermd %ymm9, %ymm2, %ymm0 + vmovdqa %ymm3, 72(%rsp) + vpblendd $15, %ymm1, %ymm5, %ymm8 + vmovdqa 104(%rsp), %ymm5 + vpminsd %ymm0, %ymm9, %ymm1 + vpmaxsd %ymm0, %ymm9, %ymm9 + vpermd %ymm12, %ymm2, %ymm0 + vpblendd $15, %ymm1, %ymm9, %ymm9 + vpminsd %ymm0, %ymm12, %ymm1 + vpmaxsd %ymm0, %ymm12, %ymm12 + vpermd %ymm5, %ymm2, %ymm0 + vmovdqa %ymm9, -88(%rsp) + vpblendd $15, %ymm1, %ymm12, %ymm12 + vpermd %ymm6, %ymm2, %ymm2 + vpminsd %ymm5, %ymm0, %ymm1 + vpmaxsd %ymm5, %ymm0, %ymm0 + vpshufd $78, %ymm10, %ymm9 + vmovdqa %ymm12, 136(%rsp) + vpblendd $15, %ymm1, %ymm0, %ymm5 + vpminsd %ymm6, %ymm2, %ymm0 + vpmaxsd %ymm6, %ymm2, %ymm2 + vpblendd $15, %ymm0, %ymm2, %ymm3 + vpshufd $78, %ymm11, %ymm0 + vpshufd $78, %ymm7, %ymm6 + vmovdqa %ymm5, 104(%rsp) + vpminsd %ymm0, %ymm11, %ymm1 + vpmaxsd %ymm0, %ymm11, %ymm0 + vpshufd $78, %ymm15, %ymm5 + vpblendd $51, %ymm1, %ymm0, %ymm0 + vpminsd %ymm6, %ymm7, %ymm1 + vpmaxsd %ymm6, %ymm7, %ymm6 + vmovdqa 8(%rsp), %ymm7 + vpblendd $51, %ymm1, %ymm6, %ymm6 + vpshufd $78, %ymm14, %ymm11 + vpshufd $78, %ymm7, %ymm12 + vpminsd %ymm7, %ymm12, %ymm1 + vpmaxsd %ymm7, %ymm12, %ymm12 + vpshufd $78, %ymm4, %ymm7 + vpblendd $51, %ymm1, %ymm12, %ymm12 + vpminsd %ymm7, %ymm4, %ymm1 + vpmaxsd %ymm7, %ymm4, %ymm7 + vmovdqa -24(%rsp), %ymm4 + vpblendd $51, %ymm1, %ymm7, %ymm7 + vpminsd %ymm5, %ymm15, %ymm1 + vpmaxsd %ymm5, %ymm15, %ymm5 + vpblendd $51, %ymm1, %ymm5, %ymm5 + vpminsd %ymm9, %ymm10, %ymm1 + vpmaxsd %ymm9, %ymm10, %ymm9 + vpshufd $78, %ymm4, %ymm10 + vpblendd $51, %ymm1, %ymm9, %ymm9 + vpshufd $78, %ymm13, %ymm15 + vpminsd %ymm4, %ymm10, %ymm1 + vpmaxsd %ymm4, %ymm10, %ymm10 + vmovdqa -56(%rsp), %ymm4 + vpblendd $51, %ymm1, %ymm10, %ymm10 + vpminsd %ymm11, %ymm14, %ymm1 + vpmaxsd %ymm11, %ymm14, %ymm11 + vpshufd $78, %ymm4, %ymm14 + vpblendd $51, %ymm1, %ymm11, %ymm11 + vpminsd %ymm4, %ymm14, %ymm1 + vpmaxsd %ymm4, %ymm14, %ymm14 + vmovdqa 72(%rsp), %ymm4 + vpblendd $51, %ymm1, %ymm14, %ymm14 + vpminsd %ymm15, 
%ymm13, %ymm1 + vpmaxsd %ymm15, %ymm13, %ymm15 + vpshufd $78, %ymm4, %ymm13 + vpblendd $51, %ymm1, %ymm15, %ymm15 + vpminsd %ymm4, %ymm13, %ymm1 + vpmaxsd %ymm4, %ymm13, %ymm13 + vmovdqa -88(%rsp), %ymm4 + vpblendd $51, %ymm1, %ymm13, %ymm13 + vpshufd $78, %ymm8, %ymm1 + vpminsd %ymm1, %ymm8, %ymm2 + vpmaxsd %ymm1, %ymm8, %ymm1 + vpshufd $78, %ymm4, %ymm8 + vpblendd $51, %ymm2, %ymm1, %ymm1 + vpminsd %ymm4, %ymm8, %ymm2 + vpmaxsd %ymm4, %ymm8, %ymm8 + vmovdqa 136(%rsp), %ymm4 + vpblendd $51, %ymm2, %ymm8, %ymm8 + vpshufd $78, %ymm4, %ymm2 + vpminsd %ymm4, %ymm2, %ymm4 + vpmaxsd 136(%rsp), %ymm2, %ymm2 + vpblendd $51, %ymm4, %ymm2, %ymm2 + vmovdqa 104(%rsp), %ymm4 + vmovdqa %ymm2, -56(%rsp) + vpshufd $78, %ymm4, %ymm2 + vpminsd %ymm4, %ymm2, %ymm4 + vpmaxsd 104(%rsp), %ymm2, %ymm2 + vpblendd $51, %ymm4, %ymm2, %ymm4 + vpshufd $78, %ymm3, %ymm2 + vmovdqa %ymm4, -88(%rsp) + vpminsd %ymm2, %ymm3, %ymm4 + vpmaxsd %ymm2, %ymm3, %ymm3 + vpblendd $51, %ymm4, %ymm3, %ymm2 + vpshufd $177, %ymm8, %ymm4 + vmovdqa %ymm2, -120(%rsp) + vpshufd $177, %ymm0, %ymm2 + vpminsd %ymm2, %ymm0, %ymm3 + vpmaxsd %ymm2, %ymm0, %ymm0 + vpblendd $85, %ymm3, %ymm0, %ymm0 + vmovdqa %ymm0, 136(%rsp) + vpshufd $177, %ymm6, %ymm0 + vpminsd %ymm0, %ymm6, %ymm2 + vpmaxsd %ymm0, %ymm6, %ymm6 + vpblendd $85, %ymm2, %ymm6, %ymm0 + vpshufd $177, %ymm14, %ymm6 + vmovdqa %ymm0, 104(%rsp) + vpshufd $177, %ymm12, %ymm0 + vpminsd %ymm0, %ymm12, %ymm2 + vpmaxsd %ymm0, %ymm12, %ymm12 + vpblendd $85, %ymm2, %ymm12, %ymm0 + vmovdqa %ymm0, 72(%rsp) + vpshufd $177, %ymm7, %ymm0 + vpminsd %ymm0, %ymm7, %ymm2 + vpmaxsd %ymm0, %ymm7, %ymm7 + vpshufd $177, %ymm5, %ymm0 + vpblendd $85, %ymm2, %ymm7, %ymm7 + vpminsd %ymm0, %ymm5, %ymm2 + vpmaxsd %ymm0, %ymm5, %ymm5 + vpshufd $177, %ymm9, %ymm0 + vmovdqa %ymm7, 40(%rsp) + vpblendd $85, %ymm2, %ymm5, %ymm7 + vmovdqa -56(%rsp), %ymm5 + vpminsd %ymm0, %ymm9, %ymm2 + vpmaxsd %ymm0, %ymm9, %ymm9 + vpshufd $177, %ymm10, %ymm0 + vmovdqa %ymm7, 8(%rsp) + vpblendd $85, %ymm2, %ymm9, %ymm7 + vpminsd %ymm0, %ymm10, %ymm2 + vpmaxsd %ymm0, %ymm10, %ymm10 + vpshufd $177, %ymm11, %ymm0 + vpblendd $85, %ymm2, %ymm10, %ymm10 + vmovdqa %ymm7, -24(%rsp) + vpshufd $177, %ymm5, %ymm7 + vpminsd %ymm0, %ymm11, %ymm2 + vpmaxsd %ymm0, %ymm11, %ymm11 + vpblendd $85, %ymm2, %ymm11, %ymm11 + vpminsd %ymm6, %ymm14, %ymm0 + vpshufd $177, %ymm15, %ymm2 + vpmaxsd %ymm6, %ymm14, %ymm14 + vpminsd %ymm2, %ymm15, %ymm3 + vmovdqa -120(%rsp), %ymm6 + vpmaxsd %ymm2, %ymm15, %ymm15 + vpblendd $85, %ymm0, %ymm14, %ymm14 + vpshufd $177, %ymm13, %ymm0 + vpminsd %ymm0, %ymm13, %ymm2 + vpblendd $85, %ymm3, %ymm15, %ymm15 + vpmaxsd %ymm0, %ymm13, %ymm13 + vmovdqa -88(%rsp), %ymm3 + vpshufd $177, %ymm1, %ymm0 + vpblendd $85, %ymm2, %ymm13, %ymm13 + vpminsd %ymm0, %ymm1, %ymm2 + vpmaxsd %ymm0, %ymm1, %ymm1 + vpshufd $177, %ymm3, %ymm9 + vpminsd %ymm4, %ymm8, %ymm0 + vpmaxsd %ymm4, %ymm8, %ymm4 + vpblendd $85, %ymm2, %ymm1, %ymm1 + vpblendd $85, %ymm0, %ymm4, %ymm4 + vpminsd %ymm5, %ymm7, %ymm0 + vpmaxsd %ymm5, %ymm7, %ymm7 + vpblendd $85, %ymm0, %ymm7, %ymm7 + vpshufd $177, %ymm6, %ymm5 + vpminsd %ymm3, %ymm9, %ymm0 + vpmaxsd %ymm3, %ymm9, %ymm9 + vpblendd $85, %ymm0, %ymm9, %ymm3 + vpminsd %ymm6, %ymm5, %ymm0 + vpmaxsd %ymm6, %ymm5, %ymm5 + vpblendd $85, %ymm0, %ymm5, %ymm12 +.L1774: + vmovdqa 136(%rsp), %ymm6 + vmovdqa 104(%rsp), %ymm5 + movq 176(%rsp), %rdx + vmovdqu %ymm6, (%rdx) + vmovdqa 72(%rsp), %ymm6 + vmovdqu %ymm5, (%r15) + vmovdqa 40(%rsp), %ymm5 + vmovdqu %ymm6, (%r14) + vmovdqa 8(%rsp), %ymm6 + vmovdqu %ymm5, 0(%r13) + vmovdqa 
-24(%rsp), %ymm5 + vmovdqu %ymm6, (%r12) + vmovdqu %ymm5, (%rbx) + movq 192(%rsp), %rbx + vmovdqu %ymm10, (%r11) + vmovdqu %ymm11, (%r10) + vmovdqu %ymm14, (%r9) + vmovdqu %ymm15, (%r8) + vmovdqu %ymm13, (%rdi) + vmovdqu %ymm1, (%rsi) + vmovdqu %ymm4, (%rcx) + movq 184(%rsp), %rcx + vmovdqu %ymm7, (%rcx) + vmovdqu %ymm3, (%rbx) + vmovdqu %ymm12, (%rax) + vzeroupper + leaq -40(%rbp), %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L1778: + .cfi_restore_state + vmovdqa 72(%rsp), %ymm6 + vmovdqa 104(%rsp), %ymm12 + vmovdqa 8(%rsp), %ymm3 + vmovdqa %ymm2, 8(%rsp) + vmovdqa %ymm6, 136(%rsp) + vmovdqa 40(%rsp), %ymm6 + vmovdqa %ymm9, 40(%rsp) + vmovdqa %ymm6, 104(%rsp) + vmovdqa -24(%rsp), %ymm6 + vmovdqa %ymm5, -24(%rsp) + vmovdqa %ymm6, 72(%rsp) + jmp .L1774 + .p2align 4,,10 + .p2align 3 +.L1779: + vmovdqa 8(%rsp), %ymm3 + vmovdqa %ymm13, %ymm11 + vmovdqa %ymm8, %ymm4 + vmovdqa %ymm0, %ymm13 + vmovdqa %ymm10, 136(%rsp) + vmovdqa %ymm5, %ymm12 + vmovdqa -88(%rsp), %ymm10 + vmovdqa %ymm3, 104(%rsp) + vmovdqa -24(%rsp), %ymm3 + vmovdqa %ymm15, 72(%rsp) + vmovdqa %ymm2, %ymm15 + vmovdqa %ymm3, 40(%rsp) + vmovdqa -56(%rsp), %ymm3 + vmovdqa %ymm14, 8(%rsp) + vmovdqa %ymm6, %ymm14 + vmovdqa %ymm3, -24(%rsp) + vmovdqa %ymm9, %ymm3 + jmp .L1774 + .p2align 4,,10 + .p2align 3 +.L1780: + vmovdqa 40(%rsp), %ymm6 + vmovdqa %ymm5, 40(%rsp) + vmovdqa -24(%rsp), %ymm5 + vmovdqa %ymm9, %ymm3 + vmovdqa %ymm10, 104(%rsp) + vmovdqa -88(%rsp), %ymm10 + vmovdqa %ymm6, 136(%rsp) + vmovdqa 8(%rsp), %ymm6 + vmovdqa %ymm5, 8(%rsp) + vmovdqa -56(%rsp), %ymm5 + vmovdqa %ymm6, 72(%rsp) + vmovdqa %ymm5, -24(%rsp) + jmp .L1774 + .cfi_endproc +.LFE18808: + .size _ZN3hwy6N_AVX26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0, .-_ZN3hwy6N_AVX26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + .section .text._ZN3hwy6N_AVX26detail7RecurseINS0_4SimdIiLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_AVX26detail7RecurseINS0_4SimdIiLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0, @function +_ZN3hwy6N_AVX26detail7RecurseINS0_4SimdIiLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0: +.LFB18809: + .cfi_startproc + leaq 8(%rsp), %r10 + .cfi_def_cfa 10, 0 + andq $-32, %rsp + pushq -8(%r10) + pushq %rbp + movq %rsp, %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + pushq %r15 + pushq %r14 + pushq %r13 + .cfi_escape 0x10,0xf,0x2,0x76,0x78 + .cfi_escape 0x10,0xe,0x2,0x76,0x70 + .cfi_escape 0x10,0xd,0x2,0x76,0x68 + movq %rdi, %r13 + pushq %r12 + .cfi_escape 0x10,0xc,0x2,0x76,0x60 + movq %rcx, %r12 + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x58,0x6 + pushq %rbx + addq $-128, %rsp + .cfi_escape 0x10,0x3,0x2,0x76,0x50 + movq %rsi, -104(%rbp) + movq %rdx, -88(%rbp) + movq %r9, -96(%rbp) + cmpq $128, %rdx + jbe .L1942 + movq %rdi, %rax + movq %rdi, -112(%rbp) + movq %r8, %rbx + shrq $2, %rax + movq %rax, %rdx + movq %rax, -144(%rbp) + andl $15, %edx + jne .L1943 + movq -88(%rbp), %r14 + movq %rdi, %rax +.L1794: + movq 8(%rbx), %rsi + movq 16(%rbx), %r11 + movq %rsi, %rdi + leaq 1(%r11), %r9 + leaq (%rsi,%rsi,8), %rcx + xorq (%rbx), %r9 + shrq $11, %rdi + rorx $40, %rsi, %rdx + leaq 2(%r11), %rsi + addq %r9, %rdx + xorq %rdi, 
%rcx + movq %rdx, %r8 + rorx $40, %rdx, %rdi + xorq %rsi, %rcx + shrq $11, %r8 + leaq (%rdx,%rdx,8), %rsi + leaq 3(%r11), %rdx + addq %rcx, %rdi + xorq %r8, %rsi + movq %rdi, %r8 + xorq %rdx, %rsi + leaq (%rdi,%rdi,8), %rdx + rorx $40, %rdi, %r15 + shrq $11, %r8 + leaq 4(%r11), %rdi + addq %rsi, %r15 + addq $5, %r11 + xorq %r8, %rdx + rorx $40, %r15, %r10 + leaq (%r15,%r15,8), %r8 + movq %r11, 16(%rbx) + xorq %rdi, %rdx + movq %r15, %rdi + addq %rdx, %r10 + shrq $11, %rdi + movq %r10, %r15 + xorq %rdi, %r8 + leaq (%r10,%r10,8), %rdi + rorx $40, %r10, %r10 + shrq $11, %r15 + xorq %r11, %r8 + movl %esi, %r11d + xorq %r15, %rdi + addq %r8, %r10 + movl %ecx, %r15d + movl %r8d, %r8d + vmovq %rdi, %xmm5 + movq %r14, %rdi + vpinsrq $1, %r10, %xmm5, %xmm0 + shrq $4, %rdi + movabsq $68719476719, %r10 + cmpq %r10, %r14 + movl $4294967295, %r10d + movl %r9d, %r14d + vmovdqu %xmm0, (%rbx) + cmova %r10, %rdi + shrq $32, %r9 + movl %edx, %r10d + shrq $32, %rcx + imulq %rdi, %r15 + shrq $32, %rsi + shrq $32, %rdx + imulq %rdi, %r8 + imulq %rdi, %r14 + shrq $32, %r15 + imulq %rdi, %r9 + imulq %rdi, %rcx + imulq %rdi, %r11 + shrq $32, %r14 + imulq %rdi, %rsi + shrq $32, %r9 + imulq %rdi, %r10 + shrq $32, %rcx + imulq %rdi, %rdx + movq %r8, %rdi + movq %r15, %r8 + shrq $32, %r11 + salq $6, %r8 + shrq $32, %rsi + vmovdqa (%rax,%r8), %ymm1 + shrq $32, %rdi + movq %r14, %r8 + shrq $32, %r10 + salq $6, %r8 + shrq $32, %rdx + vmovdqa (%rax,%r8), %ymm0 + movq %r9, %r8 + salq $4, %r15 + salq $6, %r8 + vmovdqa 32(%rax,%r15,4), %ymm3 + vpminsd %ymm1, %ymm0, %ymm2 + vpmaxsd (%rax,%r8), %ymm2, %ymm2 + movq %rsi, %r8 + salq $6, %r8 + vpmaxsd %ymm1, %ymm0, %ymm0 + vmovdqa (%rax,%r8), %ymm1 + movq %rcx, %r8 + vpminsd %ymm0, %ymm2, %ymm2 + salq $6, %r8 + vmovdqa %ymm2, (%r12) + vmovdqa (%rax,%r8), %ymm0 + movq %r11, %r8 + salq $6, %r8 + vpminsd %ymm1, %ymm0, %ymm7 + vpmaxsd (%rax,%r8), %ymm7, %ymm7 + movq %rdi, %r8 + salq $6, %r8 + vpmaxsd %ymm1, %ymm0, %ymm0 + vmovdqa (%rax,%r8), %ymm1 + movq %r10, %r8 + vpminsd %ymm0, %ymm7, %ymm7 + salq $6, %r8 + vmovdqa %ymm7, 64(%r12) + vmovdqa (%rax,%r8), %ymm0 + movq %rdx, %r8 + salq $6, %r8 + salq $4, %r14 + vpminsd %ymm1, %ymm0, %ymm6 + vpmaxsd (%rax,%r8), %ymm6, %ymm6 + salq $4, %r9 + salq $4, %rsi + vpmaxsd %ymm1, %ymm0, %ymm0 + salq $4, %rcx + vmovdqa 32(%rax,%rsi,4), %ymm4 + salq $4, %r11 + vpminsd %ymm0, %ymm6, %ymm6 + vmovdqa 32(%rax,%r14,4), %ymm0 + salq $4, %rdi + salq $4, %r10 + salq $4, %rdx + leaq 192(%r12), %r14 + vmovdqa %ymm6, 128(%r12) + vpminsd %ymm3, %ymm0, %ymm1 + vpmaxsd 32(%rax,%r9,4), %ymm1, %ymm1 + vpmaxsd %ymm3, %ymm0, %ymm0 + vpminsd %ymm0, %ymm1, %ymm1 + vmovdqa 32(%rax,%rcx,4), %ymm0 + vmovdqa %ymm1, 32(%r12) + vpminsd %ymm4, %ymm0, %ymm3 + vpmaxsd 32(%rax,%r11,4), %ymm3, %ymm3 + vpmaxsd %ymm4, %ymm0, %ymm0 + vmovdqa 32(%rax,%rdi,4), %ymm4 + vpminsd %ymm0, %ymm3, %ymm3 + vmovdqa 32(%rax,%r10,4), %ymm0 + vmovdqa %ymm3, 96(%r12) + vpminsd %ymm4, %ymm0, %ymm5 + vpmaxsd 32(%rax,%rdx,4), %ymm5, %ymm5 + vpmaxsd %ymm4, %ymm0, %ymm0 + vpminsd %ymm0, %ymm5, %ymm5 + vpbroadcastd %xmm2, %ymm0 + vpxor %ymm2, %ymm0, %ymm2 + vpxor %ymm1, %ymm0, %ymm1 + vpxor %ymm7, %ymm0, %ymm7 + vmovdqa %ymm5, 160(%r12) + vpor %ymm2, %ymm1, %ymm1 + vpxor %ymm3, %ymm0, %ymm3 + vpxor %ymm6, %ymm0, %ymm6 + vpor %ymm7, %ymm1, %ymm1 + vpxor %ymm5, %ymm0, %ymm5 + vmovdqa %ymm0, %ymm4 + vpxor 192(%r12), %ymm0, %ymm2 + vpor %ymm3, %ymm1, %ymm1 + vpxor %xmm3, %xmm3, %xmm3 + vpor %ymm6, %ymm1, %ymm1 + vpor %ymm5, %ymm1, %ymm1 + vpor %ymm1, %ymm2, %ymm2 + vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 + 
vpxor %xmm2, %xmm2, %xmm2 + vpcmpeqd %ymm2, %ymm1, %ymm1 + vmovmskps %ymm1, %eax + cmpl $255, %eax + je .L1796 + vmovdqa .LC14(%rip), %ymm0 + movl $4, %esi + movq %r12, %rdi + vmovdqu %ymm0, 192(%r12) + vmovdqu %ymm0, 224(%r12) + vmovdqu %ymm0, 256(%r12) + vzeroupper + call _ZN3hwy6N_AVX26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + vpbroadcastd (%r12), %ymm0 + vpcmpeqd %ymm2, %ymm2, %ymm2 + vpbroadcastd 188(%r12), %ymm1 + vpaddd %ymm2, %ymm1, %ymm2 + vpcmpeqd %ymm0, %ymm2, %ymm2 + vmovmskps %ymm2, %eax + cmpl $255, %eax + jne .L1798 + movq -88(%rbp), %rsi + leaq -80(%rbp), %rdx + movq %r14, %rcx + movq %r13, %rdi + call _ZN3hwy6N_AVX26detail22MaybePartitionTwoValueINS0_4SimdIiLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, %al + jne .L1937 +.L1798: + movl 96(%r12), %ecx + movl $23, %eax + movl $24, %edx + cmpl 92(%r12), %ecx + je .L1832 + jmp .L1837 + .p2align 4,,10 + .p2align 3 +.L1835: + testq %rax, %rax + je .L1944 +.L1832: + movq %rax, %rdx + subq $1, %rax + movl (%r12,%rax,4), %esi + cmpl %esi, %ecx + je .L1835 + cmpl %ecx, (%r12,%rdx,4) + je .L1837 + movl %esi, %ecx + jmp .L1834 + .p2align 4,,10 + .p2align 3 +.L1838: + cmpq $47, %rdx + je .L1940 +.L1837: + movq %rdx, %rsi + addq $1, %rdx + cmpl (%r12,%rdx,4), %ecx + je .L1838 + movl $24, %edx + subq $23, %rsi + subq %rax, %rdx + cmpq %rdx, %rsi + jb .L1834 +.L1940: + movl (%r12,%rax,4), %ecx +.L1834: + vmovd %ecx, %xmm0 + vpbroadcastd %xmm0, %ymm0 +.L1941: + movl $1, -112(%rbp) +.L1831: + cmpq $0, -96(%rbp) + je .L1945 + movq -88(%rbp), %rax + vmovdqa %ymm0, %ymm1 + leaq -8(%rax), %r10 + movq %r10, %rdx + movq %r10, %rsi + vmovdqu 0(%r13,%r10,4), %ymm5 + andl $31, %edx + andl $24, %esi + je .L1880 + vmovdqu 0(%r13), %ymm3 + vpcmpeqd %ymm4, %ymm4, %ymm4 + leaq _ZZN3hwy6N_AVX26detail18IndicesFromBits256IiLPv0EEENS0_6Vec256IjEEmE12packed_array(%rip), %r9 + xorl %ecx, %ecx + vpcmpgtd %ymm0, %ymm3, %ymm6 + vpxor %ymm4, %ymm6, %ymm0 + vmovmskps %ymm6, %esi + vmovmskps %ymm0, %eax + vmovdqa .LC17(%rip), %ymm0 + popcntq %rsi, %rcx + vpbroadcastd (%r9,%rax,4), %ymm2 + popcntq %rax, %rax + leaq 0(%r13,%rax,4), %rax + vpsrlvd %ymm0, %ymm2, %ymm2 + vpslld $28, %ymm2, %ymm7 + vpermd %ymm3, %ymm2, %ymm2 + vpmaskmovd %ymm2, %ymm7, 0(%r13) + vpbroadcastd (%r9,%rsi,4), %ymm2 + vpsrlvd %ymm0, %ymm2, %ymm2 + vpermd %ymm3, %ymm2, %ymm3 + vmovdqu %ymm3, (%r12) + testb $16, %r10b + je .L1842 + vmovdqu 32(%r13), %ymm3 + vpcmpgtd %ymm1, %ymm3, %ymm6 + vpxor %ymm4, %ymm6, %ymm2 + vmovmskps %ymm2, %esi + vpbroadcastd (%r9,%rsi,4), %ymm2 + popcntq %rsi, %rsi + vpsrlvd %ymm0, %ymm2, %ymm2 + vpslld $28, %ymm2, %ymm7 + vpermd %ymm3, %ymm2, %ymm2 + vpmaskmovd %ymm2, %ymm7, (%rax) + leaq (%rax,%rsi,4), %rax + vmovmskps %ymm6, %esi + vpbroadcastd (%r9,%rsi,4), %ymm2 + popcntq %rsi, %rsi + vpsrlvd %ymm0, %ymm2, %ymm2 + vpermd %ymm3, %ymm2, %ymm3 + vmovdqu %ymm3, (%r12,%rcx,4) + addq %rsi, %rcx + cmpq $23, %rdx + jbe .L1842 + vmovdqu 64(%r13), %ymm3 + vpcmpgtd %ymm1, %ymm3, %ymm6 + vpxor %ymm4, %ymm6, %ymm2 + vmovmskps %ymm2, %esi + vpbroadcastd (%r9,%rsi,4), %ymm2 + popcntq %rsi, %rsi + vpsrlvd %ymm0, %ymm2, %ymm2 + vpslld $28, %ymm2, %ymm4 + vpermd %ymm3, %ymm2, %ymm2 + vpmaskmovd %ymm2, %ymm4, (%rax) + leaq (%rax,%rsi,4), %rax + vmovmskps %ymm6, %esi + vpbroadcastd (%r9,%rsi,4), %ymm2 + popcntq %rsi, %rsi + vpsrlvd %ymm0, %ymm2, %ymm2 + vpermd %ymm3, %ymm2, %ymm3 + vmovdqu %ymm3, (%r12,%rcx,4) + addq %rsi, %rcx 
+.L1842: + leaq -8(%rdx), %rsi + leaq 1(%rdx), %rdi + andq $-8, %rsi + leaq 0(,%rcx,4), %r8 + addq $8, %rsi + cmpq $8, %rdi + movl $8, %edi + cmovbe %rdi, %rsi +.L1841: + cmpq %rdx, %rsi + je .L1843 + subq %rsi, %rdx + vmovdqu 0(%r13,%rsi,4), %ymm4 + vmovd %edx, %xmm2 + vpbroadcastd %xmm2, %ymm2 + vpcmpgtd %ymm1, %ymm4, %ymm6 + vpcmpgtd .LC3(%rip), %ymm2, %ymm2 + vpandn %ymm2, %ymm6, %ymm3 + vpand %ymm2, %ymm6, %ymm6 + vmovmskps %ymm3, %edx + vpbroadcastd (%r9,%rdx,4), %ymm3 + popcntq %rdx, %rdx + vpsrlvd %ymm0, %ymm3, %ymm3 + vpslld $28, %ymm3, %ymm7 + vpermd %ymm4, %ymm3, %ymm3 + vpmaskmovd %ymm3, %ymm7, (%rax) + leaq (%rax,%rdx,4), %rax + vmovmskps %ymm6, %edx + vpbroadcastd (%r9,%rdx,4), %ymm2 + popcntq %rdx, %rdx + addq %rdx, %rcx + vpsrlvd %ymm0, %ymm2, %ymm2 + vpermd %ymm4, %ymm2, %ymm4 + vmovdqu %ymm4, (%r12,%r8) + leaq 0(,%rcx,4), %r8 +.L1843: + movq %r10, %rdx + subq %rcx, %rdx + movl %r8d, %ecx + leaq 0(%r13,%rdx,4), %r11 + cmpl $8, %r8d + jnb .L1844 + testb $4, %r8b + jne .L1946 + testl %ecx, %ecx + jne .L1947 +.L1845: + movl %r8d, %ecx + cmpl $8, %r8d + jnb .L1848 + andl $4, %r8d + jne .L1948 + testl %ecx, %ecx + jne .L1949 +.L1849: + movq %rax, %rcx + subq %r13, %rcx + sarq $2, %rcx + subq %rcx, %r10 + subq %rcx, %rdx + movq %rcx, %r15 + movq %r10, -144(%rbp) + leaq (%rax,%rdx,4), %rcx + je .L1881 + leaq 128(%rax), %rsi + leaq -128(%rcx), %r8 + vmovdqu (%rax), %ymm13 + vmovdqu 32(%rax), %ymm12 + vmovdqu 64(%rax), %ymm11 + vmovdqu 96(%rax), %ymm10 + vmovdqu -128(%rcx), %ymm9 + vmovdqu -96(%rcx), %ymm8 + vmovdqu -64(%rcx), %ymm7 + vmovdqu -32(%rcx), %ymm6 + cmpq %r8, %rsi + je .L1882 + xorl %ecx, %ecx + leaq _ZZN3hwy6N_AVX26detail21IndicesFromNotBits256IiLPv0EEENS0_6Vec256IjEEmE12packed_array(%rip), %rdi + movl $8, %r10d + jmp .L1856 + .p2align 4,,10 + .p2align 3 +.L1951: + vmovdqu -128(%r8), %ymm14 + vmovdqu -96(%r8), %ymm4 + prefetcht0 -512(%r8) + addq $-128, %r8 + vmovdqu 64(%r8), %ymm3 + vmovdqu 96(%r8), %ymm2 +.L1855: + vpcmpgtd %ymm1, %ymm14, %ymm15 + leaq -8(%rdx,%rcx), %r14 + vmovmskps %ymm15, %r11d + vpbroadcastd (%rdi,%r11,4), %ymm15 + popcntq %r11, %r11 + vpsrlvd %ymm0, %ymm15, %ymm15 + vpermd %ymm14, %ymm15, %ymm14 + vmovdqu %ymm14, (%rax,%rcx,4) + addq $8, %rcx + vmovdqu %ymm14, (%rax,%r14,4) + vpcmpgtd %ymm1, %ymm4, %ymm14 + subq %r11, %rcx + leaq -16(%rdx,%rcx), %r14 + vmovmskps %ymm14, %r11d + vpbroadcastd (%rdi,%r11,4), %ymm14 + popcntq %r11, %r11 + vpsrlvd %ymm0, %ymm14, %ymm14 + vpermd %ymm4, %ymm14, %ymm4 + vmovdqu %ymm4, (%rax,%rcx,4) + vmovdqu %ymm4, (%rax,%r14,4) + vpcmpgtd %ymm1, %ymm3, %ymm4 + movq %r10, %r14 + subq %r11, %r14 + addq %r14, %rcx + vmovmskps %ymm4, %r11d + leaq -24(%rdx,%rcx), %r14 + subq $32, %rdx + vpbroadcastd (%rdi,%r11,4), %ymm4 + popcntq %r11, %r11 + vpsrlvd %ymm0, %ymm4, %ymm4 + vpermd %ymm3, %ymm4, %ymm3 + vmovdqu %ymm3, (%rax,%rcx,4) + vmovdqu %ymm3, (%rax,%r14,4) + vpcmpgtd %ymm1, %ymm2, %ymm3 + movq %r10, %r14 + subq %r11, %r14 + leaq (%r14,%rcx), %r11 + vmovmskps %ymm3, %ecx + leaq (%r11,%rdx), %r14 + vpbroadcastd (%rdi,%rcx,4), %ymm3 + popcntq %rcx, %rcx + vpsrlvd %ymm0, %ymm3, %ymm3 + vpermd %ymm2, %ymm3, %ymm2 + vmovdqu %ymm2, (%rax,%r11,4) + vmovdqu %ymm2, (%rax,%r14,4) + movq %r10, %r14 + subq %rcx, %r14 + leaq (%r14,%r11), %rcx + cmpq %r8, %rsi + je .L1950 +.L1856: + movq %rsi, %r11 + subq %rax, %r11 + sarq $2, %r11 + subq %rcx, %r11 + cmpq $32, %r11 + ja .L1951 + vmovdqu (%rsi), %ymm14 + vmovdqu 32(%rsi), %ymm4 + prefetcht0 512(%rsi) + subq $-128, %rsi + vmovdqu -64(%rsi), %ymm3 + vmovdqu -32(%rsi), %ymm2 + jmp .L1855 + 
.p2align 4,,10 + .p2align 3 +.L1943: + movl $16, %eax + subq %rdx, %rax + leaq (%rdi,%rax,4), %rax + movq -88(%rbp), %rdi + leaq -16(%rdx,%rdi), %r14 + jmp .L1794 + .p2align 4,,10 + .p2align 3 +.L1950: + leaq (%rdx,%rcx), %rsi + leaq (%rax,%rcx,4), %r8 + addq $8, %rcx +.L1853: + vpcmpgtd %ymm1, %ymm13, %ymm2 + vmovmskps %ymm2, %r10d + vpbroadcastd (%rdi,%r10,4), %ymm2 + popcntq %r10, %r10 + subq %r10, %rcx + vpsrlvd %ymm0, %ymm2, %ymm2 + vpermd %ymm13, %ymm2, %ymm13 + vpcmpgtd %ymm1, %ymm12, %ymm2 + vmovdqu %ymm13, (%r8) + leaq -16(%rdx,%rcx), %r8 + vmovdqu %ymm13, -32(%rax,%rsi,4) + vmovmskps %ymm2, %esi + vpbroadcastd (%rdi,%rsi,4), %ymm2 + popcntq %rsi, %rsi + vpsrlvd %ymm0, %ymm2, %ymm2 + vpermd %ymm12, %ymm2, %ymm12 + vpcmpgtd %ymm1, %ymm11, %ymm2 + vmovdqu %ymm12, (%rax,%rcx,4) + subq %rsi, %rcx + vmovdqu %ymm12, (%rax,%r8,4) + addq $8, %rcx + vmovmskps %ymm2, %r8d + leaq -24(%rdx,%rcx), %rsi + vpbroadcastd (%rdi,%r8,4), %ymm2 + popcntq %r8, %r8 + vpsrlvd %ymm0, %ymm2, %ymm2 + vpermd %ymm11, %ymm2, %ymm11 + vpcmpgtd %ymm1, %ymm10, %ymm2 + vmovdqu %ymm11, (%rax,%rcx,4) + vmovdqu %ymm11, (%rax,%rsi,4) + movl $8, %esi + movq %rsi, %r10 + subq %r8, %r10 + vmovmskps %ymm2, %r8d + vpbroadcastd (%rdi,%r8,4), %ymm2 + addq %r10, %rcx + popcntq %r8, %r8 + leaq -32(%rdx,%rcx), %r10 + vpsrlvd %ymm0, %ymm2, %ymm2 + vpermd %ymm10, %ymm2, %ymm10 + vpcmpgtd %ymm1, %ymm9, %ymm2 + vmovdqu %ymm10, (%rax,%rcx,4) + vmovdqu %ymm10, (%rax,%r10,4) + movq %rsi, %r10 + subq %r8, %r10 + leaq (%r10,%rcx), %r8 + vmovmskps %ymm2, %ecx + vpbroadcastd (%rdi,%rcx,4), %ymm2 + leaq -40(%rdx,%r8), %r10 + popcntq %rcx, %rcx + vpsrlvd %ymm0, %ymm2, %ymm2 + vpermd %ymm9, %ymm2, %ymm9 + vpcmpgtd %ymm1, %ymm8, %ymm2 + vmovdqu %ymm9, (%rax,%r8,4) + vmovdqu %ymm9, (%rax,%r10,4) + movq %rsi, %r10 + subq %rcx, %r10 + leaq (%r10,%r8), %rcx + vmovmskps %ymm2, %r8d + vpbroadcastd (%rdi,%r8,4), %ymm2 + leaq -48(%rdx,%rcx), %r10 + popcntq %r8, %r8 + vpsrlvd %ymm0, %ymm2, %ymm2 + vpermd %ymm8, %ymm2, %ymm8 + vpcmpgtd %ymm1, %ymm7, %ymm2 + vmovdqu %ymm8, (%rax,%rcx,4) + vmovdqu %ymm8, (%rax,%r10,4) + movq %rsi, %r10 + subq %r8, %r10 + leaq (%r10,%rcx), %r8 + vmovmskps %ymm2, %ecx + vpbroadcastd (%rdi,%rcx,4), %ymm2 + leaq -56(%rdx,%r8), %r10 + vpsrlvd %ymm0, %ymm2, %ymm2 + vpermd %ymm7, %ymm2, %ymm7 + vpcmpgtd %ymm1, %ymm6, %ymm2 + vmovdqu %ymm7, (%rax,%r8,4) + vmovdqu %ymm7, (%rax,%r10,4) + xorl %r10d, %r10d + popcntq %rcx, %r10 + movq %rsi, %rcx + subq %r10, %rcx + addq %r8, %rcx + vmovmskps %ymm2, %r8d + vpbroadcastd (%rdi,%r8,4), %ymm2 + leaq -64(%rdx,%rcx), %rdx + popcntq %r8, %r8 + movq -144(%rbp), %rdi + subq %r8, %rsi + vpsrlvd %ymm0, %ymm2, %ymm2 + vpermd %ymm6, %ymm2, %ymm6 + vmovdqu %ymm6, (%rax,%rcx,4) + vmovdqu %ymm6, (%rax,%rdx,4) + leaq (%rsi,%rcx), %rdx + subq %rdx, %rdi + leaq (%rax,%rdx,4), %rcx +.L1852: + movq %rcx, %rsi + cmpq $7, %rdi + ja .L1857 + movq -144(%rbp), %rdi + leaq -32(%rax,%rdi,4), %rsi +.L1857: + vpcmpgtd %ymm1, %ymm5, %ymm1 + vpcmpeqd %ymm2, %ymm2, %ymm2 + vmovdqu (%rsi), %ymm7 + movq -144(%rbp), %rdi + vmovdqa %ymm7, -176(%rbp) + vpxor %ymm2, %ymm1, %ymm2 + vmovdqu %ymm7, (%rax,%rdi,4) + vmovmskps %ymm2, %esi + vpbroadcastd (%r9,%rsi,4), %ymm2 + popcntq %rsi, %rsi + addq %rsi, %rdx + leaq (%r15,%rdx), %r14 + vpsrlvd %ymm0, %ymm2, %ymm2 + vpslld $28, %ymm2, %ymm3 + vpermd %ymm5, %ymm2, %ymm2 + vpmaskmovd %ymm2, %ymm3, (%rcx) + vmovmskps %ymm1, %ecx + vpbroadcastd (%r9,%rcx,4), %ymm1 + vpsrlvd %ymm0, %ymm1, %ymm0 + vpslld $28, %ymm0, %ymm1 + vpermd %ymm5, %ymm0, %ymm5 + vpmaskmovd %ymm5, %ymm1, 
(%rax,%rdx,4) + movq -96(%rbp), %r15 + subq $1, %r15 + cmpl $2, -112(%rbp) + je .L1952 + movq -104(%rbp), %rsi + movq %r15, %r9 + movq %rbx, %r8 + movq %r12, %rcx + movq %r14, %rdx + movq %r13, %rdi + vzeroupper + call _ZN3hwy6N_AVX26detail7RecurseINS0_4SimdIiLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + cmpl $3, -112(%rbp) + je .L1937 +.L1859: + movq -88(%rbp), %rdx + movq -104(%rbp), %rsi + leaq 0(%r13,%r14,4), %rdi + movq %r15, %r9 + movq %rbx, %r8 + movq %r12, %rcx + subq %r14, %rdx + call _ZN3hwy6N_AVX26detail7RecurseINS0_4SimdIiLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L1937: + subq $-128, %rsp + popq %rbx + popq %r10 + .cfi_remember_state + .cfi_def_cfa 10, 0 + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + leaq -8(%r10), %rsp + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L1848: + .cfi_restore_state + movq (%r12), %rcx + leaq 8(%r11), %rdi + andq $-8, %rdi + movq %rcx, (%r11) + movl %r8d, %ecx + movq -8(%r12,%rcx), %rsi + movq %rsi, -8(%r11,%rcx) + subq %rdi, %r11 + movq %r12, %rsi + leal (%r8,%r11), %ecx + subq %r11, %rsi + shrl $3, %ecx + rep movsq + jmp .L1849 + .p2align 4,,10 + .p2align 3 +.L1844: + movq (%r11), %rcx + leaq 8(%rax), %rdi + andq $-8, %rdi + movq %rcx, (%rax) + movl %r8d, %ecx + movq -8(%r11,%rcx), %rsi + movq %rsi, -8(%rax,%rcx) + movq %rax, %rcx + movq %r11, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %r8d, %ecx + shrl $3, %ecx + rep movsq + jmp .L1845 + .p2align 4,,10 + .p2align 3 +.L1944: + movl (%r12), %ecx + jmp .L1834 + .p2align 4,,10 + .p2align 3 +.L1949: + movzbl (%r12), %esi + movb %sil, (%r11) + testb $2, %cl + je .L1849 + movzwl -2(%r12,%rcx), %esi + movw %si, -2(%r11,%rcx) + jmp .L1849 + .p2align 4,,10 + .p2align 3 +.L1947: + movzbl (%r11), %esi + movb %sil, (%rax) + testb $2, %cl + je .L1845 + movzwl -2(%r11,%rcx), %esi + movw %si, -2(%rax,%rcx) + jmp .L1845 + .p2align 4,,10 + .p2align 3 +.L1942: + cmpq $1, %rdx + jbe .L1937 + leaq 512(%rdi), %rax + cmpq %rax, %rsi + jb .L1953 + movl $8, %esi + call _ZN3hwy6N_AVX26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + jmp .L1937 + .p2align 4,,10 + .p2align 3 +.L1796: + movq -144(%rbp), %rax + movl $8, %edi + vmovdqa .LC3(%rip), %ymm5 + vpcmpeqd 0(%r13), %ymm0, %ymm1 + andl $7, %eax + subq %rax, %rdi + vmovd %edi, %xmm2 + vpbroadcastd %xmm2, %ymm2 + vpcmpgtd %ymm5, %ymm2, %ymm2 + vpandn %ymm2, %ymm1, %ymm1 + vmovmskps %ymm1, %eax + testl %eax, %eax + jne .L1954 + vpxor %xmm2, %xmm2, %xmm2 + movq -88(%rbp), %r8 + leaq 512(%r13,%rdi,4), %rsi + vpxor %xmm6, %xmm6, %xmm6 + vmovdqa %ymm2, %ymm1 + .p2align 4,,10 + .p2align 3 +.L1802: + movq %rdi, %rcx + leaq 128(%rdi), %rdi + cmpq %rdi, %r8 + jb .L1955 + leaq -512(%rsi), %rax +.L1801: + vpxor (%rax), %ymm4, %ymm3 + leaq 64(%rax), %rdx + vpor %ymm1, %ymm3, %ymm1 + vpxor 32(%rax), %ymm4, %ymm3 + vpor %ymm2, %ymm3, %ymm2 + vpxor 64(%rax), %ymm4, %ymm3 + vpor %ymm1, %ymm3, %ymm1 + vpxor 96(%rax), %ymm4, %ymm3 + vpor %ymm2, %ymm3, %ymm2 + vpxor 128(%rax), %ymm4, %ymm3 + vpor %ymm1, %ymm3, %ymm1 + vpxor 160(%rax), %ymm4, %ymm3 + leaq 192(%rdx), %rax + vpor %ymm2, %ymm3, %ymm2 + vpxor 128(%rdx), %ymm4, %ymm3 + vpor %ymm1, %ymm3, %ymm1 + vpxor 160(%rdx), %ymm4, %ymm3 + vpor %ymm2, %ymm3, %ymm2 + cmpq %rsi, %rax + jne .L1801 + vpor %ymm2, %ymm1, %ymm3 + leaq 704(%rdx), %rsi + vpcmpeqd %ymm6, %ymm3, %ymm3 + vmovmskps %ymm3, %eax + cmpl $255, %eax + je .L1802 + 
vpcmpeqd 0(%r13,%rcx,4), %ymm0, %ymm1 + vpcmpeqd %ymm2, %ymm2, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vmovmskps %ymm1, %eax + testl %eax, %eax + jne .L1804 + .p2align 4,,10 + .p2align 3 +.L1803: + addq $8, %rcx + vpcmpeqd 0(%r13,%rcx,4), %ymm0, %ymm1 + vpxor %ymm1, %ymm2, %ymm1 + vmovmskps %ymm1, %eax + testl %eax, %eax + je .L1803 +.L1804: + tzcntl %eax, %eax + addq %rcx, %rax +.L1800: + vpbroadcastd 0(%r13,%rax,4), %ymm2 + vmovdqa %ymm0, %ymm8 + leaq 0(%r13,%rax,4), %rdi + vpcmpgtd %ymm0, %ymm2, %ymm1 + vmovdqa %ymm2, %ymm6 + vmovmskps %ymm1, %edx + testl %edx, %edx + jne .L1809 + movq -88(%rbp), %rsi + xorl %ecx, %ecx + leaq -8(%rsi), %rax + jmp .L1814 + .p2align 4,,10 + .p2align 3 +.L1810: + vmovmskps %ymm1, %edx + vmovdqu %ymm8, 0(%r13,%rax,4) + popcntq %rdx, %rdx + addq %rdx, %rcx + leaq -8(%rax), %rdx + cmpq %rdx, %rsi + jbe .L1956 + movq %rdx, %rax +.L1814: + vpcmpeqd 0(%r13,%rax,4), %ymm2, %ymm3 + vpcmpeqd 0(%r13,%rax,4), %ymm0, %ymm1 + vpor %ymm3, %ymm1, %ymm4 + vmovmskps %ymm4, %edx + cmpl $255, %edx + je .L1810 + vpcmpeqd %ymm0, %ymm0, %ymm0 + leaq 8(%rax), %rsi + vpxor %ymm0, %ymm1, %ymm7 + vpandn %ymm7, %ymm3, %ymm3 + vmovmskps %ymm3, %edx + tzcntl %edx, %edx + addq %rax, %rdx + addq $16, %rax + vpbroadcastd 0(%r13,%rdx,4), %ymm0 + movq -88(%rbp), %rdx + subq %rcx, %rdx + vmovdqa %ymm0, %ymm4 + vmovdqa %ymm0, -80(%rbp) + cmpq %rax, %rdx + jb .L1811 + .p2align 4,,10 + .p2align 3 +.L1812: + vmovdqu %ymm6, -32(%r13,%rax,4) + movq %rax, %rsi + addq $8, %rax + cmpq %rdx, %rax + jbe .L1812 +.L1811: + subq %rsi, %rdx + vmovd %edx, %xmm0 + vpbroadcastd %xmm0, %ymm0 + vpcmpgtd %ymm5, %ymm0, %ymm0 + vpmaskmovd %ymm2, %ymm0, 0(%r13,%rsi,4) +.L1813: + vpbroadcastd (%r12), %ymm3 + vpcmpeqd .LC15(%rip), %ymm3, %ymm1 + vmovdqa %ymm3, %ymm0 + vmovmskps %ymm1, %eax + cmpl $255, %eax + je .L1877 + vpcmpeqd .LC16(%rip), %ymm3, %ymm1 + vmovmskps %ymm1, %eax + cmpl $255, %eax + je .L1827 + vpminsd %ymm4, %ymm2, %ymm1 + vpcmpgtd %ymm1, %ymm3, %ymm1 + vmovmskps %ymm1, %eax + testl %eax, %eax + jne .L1957 + vmovdqa %ymm3, %ymm2 + movl $128, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L1822: + leaq (%rcx,%rax,8), %rdx + addq $1, %rax + vpminsd 0(%r13,%rdx,4), %ymm2, %ymm1 + vmovdqa %ymm1, %ymm2 + cmpq $16, %rax + jne .L1822 + vpcmpgtd %ymm1, %ymm3, %ymm1 + vmovmskps %ymm1, %eax + testl %eax, %eax + jne .L1941 + leaq 128(%rsi), %rax + cmpq %rax, -88(%rbp) + jb .L1958 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L1822 +.L1880: + vmovdqa .LC17(%rip), %ymm0 + xorl %r8d, %r8d + xorl %ecx, %ecx + movq %r13, %rax + leaq _ZZN3hwy6N_AVX26detail18IndicesFromBits256IiLPv0EEENS0_6Vec256IjEEmE12packed_array(%rip), %r9 + jmp .L1841 +.L1952: + vzeroupper + jmp .L1859 +.L1881: + movq %r10, %rdi + movq %rax, %rcx + jmp .L1852 +.L1882: + movq %rax, %r8 + movq %rdx, %rsi + movl $8, %ecx + leaq _ZZN3hwy6N_AVX26detail21IndicesFromNotBits256IiLPv0EEENS0_6Vec256IjEEmE12packed_array(%rip), %rdi + jmp .L1853 +.L1945: + movq -88(%rbp), %rsi + leaq -1(%rsi), %rbx + movq %rbx, %r12 + shrq %r12 + .p2align 4,,10 + .p2align 3 +.L1839: + movq %r12, %rdx + movq %r13, %rdi + call _ZN3hwy6N_AVX26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0 + subq $1, %r12 + jnb .L1839 + .p2align 4,,10 + .p2align 3 +.L1840: + movl 0(%r13,%rbx,4), %edx + movl 0(%r13), %eax + movq %rbx, %rsi + movq %r13, %rdi + movl %edx, 0(%r13) + xorl %edx, %edx + movl %eax, 0(%r13,%rbx,4) + call 
_ZN3hwy6N_AVX26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_mm.isra.0 + subq $1, %rbx + jne .L1840 + jmp .L1937 +.L1946: + movl (%r11), %esi + movl %esi, (%rax) + movl -4(%r11,%rcx), %esi + movl %esi, -4(%rax,%rcx) + jmp .L1845 +.L1948: + movl (%r12), %esi + movl %esi, (%r11) + movl -4(%r12,%rcx), %esi + movl %esi, -4(%r11,%rcx) + jmp .L1849 +.L1955: + movq -88(%rbp), %rsi + vpcmpeqd %ymm2, %ymm2, %ymm2 + .p2align 4,,10 + .p2align 3 +.L1806: + movq %rcx, %rdx + addq $8, %rcx + cmpq %rcx, %rsi + jb .L1959 + vpcmpeqd -32(%r13,%rcx,4), %ymm0, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vmovmskps %ymm1, %eax + testl %eax, %eax + je .L1806 +.L1939: + tzcntl %eax, %eax + addq %rdx, %rax + jmp .L1800 +.L1809: + movq -88(%rbp), %rsi + leaq -80(%rbp), %rdx + movq %r12, %rcx + vmovdqa %ymm2, %ymm1 + vmovdqa %ymm2, -144(%rbp) + subq %rax, %rsi + call _ZN3hwy6N_AVX26detail22MaybePartitionTwoValueINS0_4SimdIiLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, %al + jne .L1937 + vmovdqa -80(%rbp), %ymm4 + vmovdqa -144(%rbp), %ymm2 + jmp .L1813 +.L1956: + vmovd %eax, %xmm3 + vpcmpeqd 0(%r13), %ymm0, %ymm1 + movq -88(%rbp), %rdx + vpbroadcastd %xmm3, %ymm3 + vpcmpeqd 0(%r13), %ymm2, %ymm4 + vpcmpgtd %ymm5, %ymm3, %ymm3 + subq %rcx, %rdx + vpand %ymm1, %ymm3, %ymm7 + vpor %ymm4, %ymm1, %ymm1 + vpcmpeqd %ymm4, %ymm4, %ymm4 + vpxor %ymm3, %ymm4, %ymm3 + vpor %ymm3, %ymm1, %ymm1 + vmovmskps %ymm1, %esi + cmpl $255, %esi + jne .L1960 + vmovmskps %ymm7, %ecx + movq %rdx, %rax + vmovdqu %ymm0, 0(%r13) + popcntq %rcx, %rcx + subq %rcx, %rax + cmpq $7, %rax + jbe .L1866 + leaq -8(%rax), %rcx + movq -112(%rbp), %rsi + movq %rcx, %rdx + shrq $3, %rdx + salq $5, %rdx + leaq 32(%r13,%rdx), %rdx + .p2align 4,,10 + .p2align 3 +.L1819: + vmovdqu %ymm6, (%rsi) + addq $32, %rsi + cmpq %rdx, %rsi + jne .L1819 + andq $-8, %rcx + leaq 8(%rcx), %rdx + leaq 0(,%rdx,4), %rcx + subq %rdx, %rax +.L1818: + vmovdqa %ymm2, (%r12) + testq %rax, %rax + je .L1936 + leaq 0(%r13,%rcx), %rdi + leaq 0(,%rax,4), %rdx + movq %r12, %rsi + vzeroupper + call memcpy@PLT + jmp .L1937 +.L1958: + movq -88(%rbp), %rdx + jmp .L1829 + .p2align 4,,10 + .p2align 3 +.L1830: + vpcmpgtd -32(%r13,%rsi,4), %ymm3, %ymm1 + vmovmskps %ymm1, %eax + testl %eax, %eax + jne .L1941 +.L1829: + movq %rsi, %rax + addq $8, %rsi + cmpq %rsi, %rdx + jnb .L1830 + movq -88(%rbp), %rdi + cmpq %rax, %rdi + je .L1877 + vpcmpgtd -32(%r13,%rdi,4), %ymm3, %ymm3 + vmovmskps %ymm3, %eax + cmpl $1, %eax + movl $1, %eax + adcl $0, %eax + movl %eax, -112(%rbp) + jmp .L1831 +.L1959: + movq -88(%rbp), %rax + vpcmpeqd %ymm2, %ymm2, %ymm2 + vpcmpeqd -32(%r13,%rax,4), %ymm0, %ymm1 + leaq -8(%rax), %rdx + vpxor %ymm2, %ymm1, %ymm1 + vmovmskps %ymm1, %eax + testl %eax, %eax + jne .L1939 +.L1936: + vzeroupper + jmp .L1937 +.L1826: + vmovdqu -32(%r13,%rsi,4), %ymm5 + vpcmpgtd %ymm3, %ymm5, %ymm1 + vmovmskps %ymm1, %eax + testl %eax, %eax + jne .L1941 +.L1825: + movq %rsi, %rax + addq $8, %rsi + cmpq %rsi, -88(%rbp) + jnb .L1826 + movq -88(%rbp), %rdi + cmpq %rax, %rdi + je .L1827 + vmovdqu -32(%r13,%rdi,4), %ymm5 + vmovdqa %ymm5, -144(%rbp) + vmovdqa -144(%rbp), %ymm5 + vpcmpgtd %ymm3, %ymm5, %ymm1 + vmovmskps %ymm1, %eax + testl %eax, %eax + jne .L1941 +.L1827: + vpcmpeqd %ymm0, %ymm0, %ymm0 + movl $3, -112(%rbp) + vpaddd %ymm0, %ymm3, %ymm0 + jmp .L1831 +.L1954: + tzcntl %eax, %eax + jmp .L1800 +.L1953: + movq %rdx, %rcx + xorl %eax, %eax + cmpq $7, %rdx + jbe .L1787 + 
movq %rdx, %rbx + leaq -8(%rdx), %rdx + movq (%rdi), %rcx + movq %rdx, %rax + shrq $3, %rax + movq %rcx, (%r12) + addq $1, %rax + salq $5, %rax + movl %eax, %ecx + movq -8(%rdi,%rcx), %rsi + leaq 8(%r12), %rdi + andq $-8, %rdi + movq %rsi, -8(%r12,%rcx) + movq %r12, %rcx + movq %r13, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %eax, %ecx + movq %rdx, %rax + andq $-8, %rax + shrl $3, %ecx + rep movsq + addq $8, %rax + subq %rax, %rbx + movq %rbx, %rcx + je .L1790 +.L1787: + salq $2, %rax + leaq 0(,%rcx,4), %rdx + testq %rcx, %rcx + movl $4, %ecx + cmove %rcx, %rdx + leaq (%r12,%rax), %rdi + leaq 0(%r13,%rax), %rsi + call memcpy@PLT +.L1790: + movq -88(%rbp), %rbx + movl $32, %edx + movl $1, %esi + movl %ebx, %eax + subl $1, %eax + bsrl %eax, %eax + xorl $31, %eax + subl %eax, %edx + movl $1, %eax + shlx %rdx, %rsi, %rsi + shrq $4, %rsi + cmove %rax, %rsi + movq %rsi, %rdx + salq $4, %rdx + addq $8, %rdx + cmpq %rdx, %rbx + jnb .L1789 + vmovdqa .LC14(%rip), %ymm0 + movq %rbx, %rax +.L1788: + vmovdqu %ymm0, (%r12,%rax,4) + addq $8, %rax + cmpq %rdx, %rax + jb .L1788 + vzeroupper +.L1789: + movq %r12, %rdi + call _ZN3hwy6N_AVX26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + xorl %eax, %eax + cmpq $7, -88(%rbp) + jbe .L1792 + movq -88(%rbp), %rbx + movq (%r12), %rcx + leaq 8(%r13), %rdi + andq $-8, %rdi + leaq -8(%rbx), %rdx + movq %rcx, 0(%r13) + movq %rdx, %rax + shrq $3, %rax + addq $1, %rax + salq $5, %rax + movl %eax, %ecx + movq -8(%r12,%rcx), %rsi + movq %rsi, -8(%r13,%rcx) + movq %r13, %rcx + movq %r12, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %eax, %ecx + movq %rdx, %rax + andq $-8, %rax + shrl $3, %ecx + rep movsq + addq $8, %rax + subq %rax, %rbx + movq %rbx, -88(%rbp) + je .L1937 +.L1792: + movq -88(%rbp), %rbx + salq $2, %rax + movl $4, %ecx + leaq 0(%r13,%rax), %rdi + leaq (%r12,%rax), %rsi + leaq 0(,%rbx,4), %rdx + testq %rbx, %rbx + cmove %rcx, %rdx + call memcpy@PLT + jmp .L1937 +.L1877: + movl $2, -112(%rbp) + jmp .L1831 +.L1957: + vpmaxsd %ymm4, %ymm2, %ymm1 + vpcmpgtd %ymm3, %ymm1, %ymm1 + vmovmskps %ymm1, %eax + testl %eax, %eax + jne .L1941 + vmovdqa %ymm3, %ymm2 + movl $128, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L1823: + leaq (%rcx,%rax,8), %rdx + addq $1, %rax + vpmaxsd 0(%r13,%rdx,4), %ymm2, %ymm1 + vmovdqa %ymm1, %ymm2 + cmpq $16, %rax + jne .L1823 + vpcmpgtd %ymm3, %ymm1, %ymm1 + vmovmskps %ymm1, %eax + testl %eax, %eax + jne .L1941 + leaq 128(%rsi), %rax + cmpq %rax, -88(%rbp) + jb .L1825 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L1823 +.L1960: + vpxor %ymm4, %ymm1, %ymm1 + vmovmskps %ymm1, %ecx + tzcntl %ecx, %ecx + vpbroadcastd 0(%r13,%rcx,4), %ymm0 + leaq 8(%rax), %rcx + vmovdqa %ymm0, %ymm4 + vmovdqa %ymm0, -80(%rbp) + cmpq %rdx, %rcx + ja .L1816 +.L1817: + vmovdqu %ymm6, -32(%r13,%rcx,4) + movq %rcx, %rax + addq $8, %rcx + cmpq %rdx, %rcx + jbe .L1817 +.L1816: + subq %rax, %rdx + vmovd %edx, %xmm0 + vpbroadcastd %xmm0, %ymm0 + vpcmpgtd %ymm5, %ymm0, %ymm0 + vpmaskmovd %ymm2, %ymm0, 0(%r13,%rax,4) + jmp .L1813 +.L1866: + xorl %ecx, %ecx + jmp .L1818 + .cfi_endproc +.LFE18809: + .size _ZN3hwy6N_AVX26detail7RecurseINS0_4SimdIiLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0, .-_ZN3hwy6N_AVX26detail7RecurseINS0_4SimdIiLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + .section .text._ZN3hwy7N_SSSE310SortI32AscEPim,"ax",@progbits + 
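+// The symbols below appear to be Highway's per-ISA int32 ascending-sort entry
+// points (hwy::N_SSSE3/N_SSE4/N_AVX2/N_AVX3/N_AVX3_ZEN4/N_SSE2::SortI32Asc).
+// Each returns immediately for n <= 1, routes small arrays (n <= 64, 128, or
+// 256 depending on vector width) through detail::SortingNetwork, and otherwise
+// fetches the random generator state and calls detail::Recurse, the vectorized
+// quicksort. The vqsort_int32_* wrappers defined later in this file
+// (e.g. vqsort_int32_avx2) simply call into them.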
.p2align 4 + .globl _ZN3hwy7N_SSSE310SortI32AscEPim + .hidden _ZN3hwy7N_SSSE310SortI32AscEPim + .type _ZN3hwy7N_SSSE310SortI32AscEPim, @function +_ZN3hwy7N_SSSE310SortI32AscEPim: +.LFB2946: + .cfi_startproc + cmpq $1, %rsi + jbe .L1983 + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r14 + .cfi_offset 14, -24 + leaq (%rdi,%rsi,4), %r14 + pushq %r13 + .cfi_offset 13, -32 + movq %rdi, %r13 + pushq %r12 + .cfi_offset 12, -40 + movq %rsi, %r12 + subq $296, %rsp + cmpq $64, %rsi + jbe .L1986 + call _ZN3hwy17GetGeneratorStateEv@PLT + movq %r12, %rdx + movq %r14, %rsi + movq %r13, %rdi + movq %rax, %r8 + leaq -320(%rbp), %rcx + movl $50, %r9d + call _ZN3hwy7N_SSSE36detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L1961: + addq $296, %rsp + popq %r12 + popq %r13 + popq %r14 + popq %rbp + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L1983: + .cfi_restore 6 + .cfi_restore 12 + .cfi_restore 13 + .cfi_restore 14 + ret + .p2align 4,,10 + .p2align 3 +.L1986: + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + .cfi_offset 12, -40 + .cfi_offset 13, -32 + .cfi_offset 14, -24 + leaq 256(%rdi), %rax + cmpq %rax, %r14 + jb .L1987 + movl $4, %esi + call _ZN3hwy7N_SSSE36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + jmp .L1961 +.L1987: + movq %rsi, %rcx + xorl %eax, %eax + leaq -320(%rbp), %r14 + cmpq $3, %rsi + jbe .L1967 + leaq -4(%rsi), %rax + leaq -320(%rbp), %r14 + movq %r13, %rsi + movq %rax, %rdx + andq $-4, %rax + movq %r14, %rdi + shrq $2, %rdx + addq $4, %rax + leal 2(%rdx,%rdx), %ecx + andl $536870910, %ecx + rep movsq + movq %r12, %rcx + subq %rax, %rcx + je .L1973 +.L1967: + salq $2, %rax + leaq 0(,%rcx,4), %rdx + testq %rcx, %rcx + movl $4, %ecx + cmove %rcx, %rdx + leaq (%r14,%rax), %rdi + leaq 0(%r13,%rax), %rsi + call memcpy@PLT +.L1973: + leal -1(%r12), %eax + movl $32, %ecx + movl $1, %esi + bsrl %eax, %eax + xorl $31, %eax + subl %eax, %ecx + movl $1, %eax + salq %cl, %rsi + shrq $4, %rsi + cmove %rax, %rsi + movq %rsi, %rdx + salq $4, %rdx + addq $4, %rdx + cmpq %rdx, %r12 + jnb .L1972 + movdqa .LC4(%rip), %xmm0 + movq %r12, %rax + .p2align 4,,10 + .p2align 3 +.L1971: + movups %xmm0, (%r14,%rax,4) + addq $4, %rax + cmpq %rdx, %rax + jb .L1971 +.L1972: + movq %r14, %rdi + call _ZN3hwy7N_SSSE36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + xorl %eax, %eax + cmpq $3, %r12 + jbe .L1975 + leaq -4(%r12), %rdx + movq -320(%rbp), %rcx + leaq 8(%r13), %rdi + movq %rdx, %rax + andq $-8, %rdi + andq $-4, %rdx + shrq $2, %rax + movq %rcx, 0(%r13) + addq $1, %rax + salq $4, %rax + movl %eax, %ecx + movq -8(%r14,%rcx), %rsi + movq %rsi, -8(%r13,%rcx) + movq %r13, %rcx + movq %r14, %rsi + subq %rdi, %rcx + addl %ecx, %eax + subq %rcx, %rsi + shrl $3, %eax + movl %eax, %ecx + leaq 4(%rdx), %rax + rep movsq + subq %rax, %r12 + je .L1961 +.L1975: + salq $2, %rax + leaq 0(,%r12,4), %rdx + testq %r12, %r12 + movl $4, %ecx + cmove %rcx, %rdx + leaq 0(%r13,%rax), %rdi + leaq (%r14,%rax), %rsi + call memcpy@PLT + jmp .L1961 + .cfi_endproc +.LFE2946: + .size _ZN3hwy7N_SSSE310SortI32AscEPim, .-_ZN3hwy7N_SSSE310SortI32AscEPim + .section .text._ZN3hwy6N_SSE410SortI32AscEPim,"ax",@progbits + .p2align 4 + .globl _ZN3hwy6N_SSE410SortI32AscEPim + .hidden _ZN3hwy6N_SSE410SortI32AscEPim + .type _ZN3hwy6N_SSE410SortI32AscEPim, @function 
+_ZN3hwy6N_SSE410SortI32AscEPim: +.LFB4014: + .cfi_startproc + cmpq $1, %rsi + jbe .L2010 + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r14 + .cfi_offset 14, -24 + leaq (%rdi,%rsi,4), %r14 + pushq %r13 + .cfi_offset 13, -32 + movq %rdi, %r13 + pushq %r12 + .cfi_offset 12, -40 + movq %rsi, %r12 + subq $296, %rsp + cmpq $64, %rsi + jbe .L2013 + call _ZN3hwy17GetGeneratorStateEv@PLT + movq %r12, %rdx + movq %r14, %rsi + movq %r13, %rdi + movq %rax, %r8 + leaq -320(%rbp), %rcx + movl $50, %r9d + call _ZN3hwy6N_SSE46detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L1988: + addq $296, %rsp + popq %r12 + popq %r13 + popq %r14 + popq %rbp + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L2010: + .cfi_restore 6 + .cfi_restore 12 + .cfi_restore 13 + .cfi_restore 14 + ret + .p2align 4,,10 + .p2align 3 +.L2013: + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + .cfi_offset 12, -40 + .cfi_offset 13, -32 + .cfi_offset 14, -24 + leaq 256(%rdi), %rax + cmpq %rax, %r14 + jb .L2014 + movl $4, %esi + call _ZN3hwy6N_SSE46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + jmp .L1988 +.L2014: + movq %rsi, %rcx + xorl %eax, %eax + leaq -320(%rbp), %r14 + cmpq $3, %rsi + jbe .L1994 + leaq -4(%rsi), %rax + leaq -320(%rbp), %r14 + movq %r13, %rsi + movq %rax, %rdx + andq $-4, %rax + movq %r14, %rdi + shrq $2, %rdx + addq $4, %rax + leal 2(%rdx,%rdx), %ecx + andl $536870910, %ecx + rep movsq + movq %r12, %rcx + subq %rax, %rcx + je .L2000 +.L1994: + salq $2, %rax + leaq 0(,%rcx,4), %rdx + testq %rcx, %rcx + movl $4, %ecx + cmove %rcx, %rdx + leaq (%r14,%rax), %rdi + leaq 0(%r13,%rax), %rsi + call memcpy@PLT +.L2000: + leal -1(%r12), %eax + movl $32, %ecx + movl $1, %esi + bsrl %eax, %eax + xorl $31, %eax + subl %eax, %ecx + movl $1, %eax + salq %cl, %rsi + shrq $4, %rsi + cmove %rax, %rsi + movq %rsi, %rdx + salq $4, %rdx + addq $4, %rdx + cmpq %rdx, %r12 + jnb .L1999 + movdqa .LC4(%rip), %xmm0 + movq %r12, %rax + .p2align 4,,10 + .p2align 3 +.L1998: + movups %xmm0, (%r14,%rax,4) + addq $4, %rax + cmpq %rdx, %rax + jb .L1998 +.L1999: + movq %r14, %rdi + call _ZN3hwy6N_SSE46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + xorl %eax, %eax + cmpq $3, %r12 + jbe .L2002 + leaq -4(%r12), %rdx + movq -320(%rbp), %rcx + leaq 8(%r13), %rdi + movq %rdx, %rax + andq $-8, %rdi + andq $-4, %rdx + shrq $2, %rax + movq %rcx, 0(%r13) + addq $1, %rax + salq $4, %rax + movl %eax, %ecx + movq -8(%r14,%rcx), %rsi + movq %rsi, -8(%r13,%rcx) + movq %r13, %rcx + movq %r14, %rsi + subq %rdi, %rcx + addl %ecx, %eax + subq %rcx, %rsi + shrl $3, %eax + movl %eax, %ecx + leaq 4(%rdx), %rax + rep movsq + subq %rax, %r12 + je .L1988 +.L2002: + salq $2, %rax + leaq 0(,%r12,4), %rdx + testq %r12, %r12 + movl $4, %ecx + cmove %rcx, %rdx + leaq 0(%r13,%rax), %rdi + leaq (%r14,%rax), %rsi + call memcpy@PLT + jmp .L1988 + .cfi_endproc +.LFE4014: + .size _ZN3hwy6N_SSE410SortI32AscEPim, .-_ZN3hwy6N_SSE410SortI32AscEPim + .section .text._ZN3hwy6N_AVX210SortI32AscEPim,"ax",@progbits + .p2align 4 + .globl _ZN3hwy6N_AVX210SortI32AscEPim + .hidden _ZN3hwy6N_AVX210SortI32AscEPim + .type _ZN3hwy6N_AVX210SortI32AscEPim, @function +_ZN3hwy6N_AVX210SortI32AscEPim: +.LFB10439: + .cfi_startproc + cmpq $1, %rsi + jbe .L2038 + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + 
.cfi_def_cfa_register 6 + pushq %r14 + .cfi_offset 14, -24 + leaq (%rdi,%rsi,4), %r14 + pushq %r13 + .cfi_offset 13, -32 + movq %rdi, %r13 + pushq %r12 + .cfi_offset 12, -40 + movq %rsi, %r12 + andq $-32, %rsp + subq $576, %rsp + cmpq $128, %rsi + jbe .L2041 + call _ZN3hwy17GetGeneratorStateEv@PLT + movq %rsp, %rcx + movq %r12, %rdx + movq %r14, %rsi + movq %rax, %r8 + movl $50, %r9d + movq %r13, %rdi + call _ZN3hwy6N_AVX26detail7RecurseINS0_4SimdIiLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L2036: + leaq -24(%rbp), %rsp + popq %r12 + popq %r13 + popq %r14 + popq %rbp + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L2038: + .cfi_restore 6 + .cfi_restore 12 + .cfi_restore 13 + .cfi_restore 14 + ret + .p2align 4,,10 + .p2align 3 +.L2041: + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + .cfi_offset 12, -40 + .cfi_offset 13, -32 + .cfi_offset 14, -24 + leaq 512(%rdi), %rax + cmpq %rax, %r14 + jb .L2042 + movl $8, %esi + call _ZN3hwy6N_AVX26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + jmp .L2036 +.L2042: + movq %rsi, %rcx + xorl %eax, %eax + movq %rsp, %r14 + cmpq $7, %rsi + jbe .L2021 + leaq -8(%rsi), %rax + movq %rsp, %r14 + movq %r13, %rsi + movq %rax, %rdx + andq $-8, %rax + movq %r14, %rdi + shrq $3, %rdx + addq $8, %rax + leal 4(,%rdx,4), %ecx + andl $536870908, %ecx + rep movsq + movq %r12, %rcx + subq %rax, %rcx + je .L2027 +.L2021: + salq $2, %rax + leaq 0(,%rcx,4), %rdx + testq %rcx, %rcx + movl $4, %ecx + cmove %rcx, %rdx + leaq (%r14,%rax), %rdi + leaq 0(%r13,%rax), %rsi + call memcpy@PLT +.L2027: + leal -1(%r12), %eax + movl $32, %edx + movl $1, %esi + bsrl %eax, %eax + xorl $31, %eax + subl %eax, %edx + movl $1, %eax + shlx %rdx, %rsi, %rsi + shrq $4, %rsi + cmove %rax, %rsi + movq %rsi, %rdx + salq $4, %rdx + addq $8, %rdx + cmpq %rdx, %r12 + jnb .L2026 + vmovdqa .LC14(%rip), %ymm0 + movq %r12, %rax + .p2align 4,,10 + .p2align 3 +.L2025: + vmovdqu %ymm0, (%r14,%rax,4) + addq $8, %rax + cmpq %rdx, %rax + jb .L2025 + vzeroupper +.L2026: + movq %r14, %rdi + call _ZN3hwy6N_AVX26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + xorl %eax, %eax + cmpq $7, %r12 + jbe .L2029 + leaq -8(%r12), %rdx + movq (%rsp), %rcx + leaq 8(%r13), %rdi + movq %rdx, %rax + andq $-8, %rdi + andq $-8, %rdx + shrq $3, %rax + movq %rcx, 0(%r13) + addq $1, %rax + salq $5, %rax + movl %eax, %ecx + movq -8(%r14,%rcx), %rsi + movq %rsi, -8(%r13,%rcx) + movq %r13, %rcx + movq %r14, %rsi + subq %rdi, %rcx + addl %ecx, %eax + subq %rcx, %rsi + shrl $3, %eax + movl %eax, %ecx + leaq 8(%rdx), %rax + rep movsq + subq %rax, %r12 + je .L2036 +.L2029: + salq $2, %rax + leaq 0(,%r12,4), %rdx + testq %r12, %r12 + movl $4, %ecx + cmove %rcx, %rdx + leaq 0(%r13,%rax), %rdi + leaq (%r14,%rax), %rsi + call memcpy@PLT + jmp .L2036 + .cfi_endproc +.LFE10439: + .size _ZN3hwy6N_AVX210SortI32AscEPim, .-_ZN3hwy6N_AVX210SortI32AscEPim + .section .text._ZN3hwy6N_AVX310SortI32AscEPim,"ax",@progbits + .p2align 4 + .globl _ZN3hwy6N_AVX310SortI32AscEPim + .hidden _ZN3hwy6N_AVX310SortI32AscEPim + .type _ZN3hwy6N_AVX310SortI32AscEPim, @function +_ZN3hwy6N_AVX310SortI32AscEPim: +.LFB12848: + .cfi_startproc + cmpq $1, %rsi + jbe .L2062 + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + pushq %r14 + .cfi_offset 15, -24 + .cfi_offset 14, -32 + leaq (%rdi,%rsi,4), %r14 + pushq %r13 + 
.cfi_offset 13, -40 + movq %rdi, %r13 + pushq %r12 + .cfi_offset 12, -48 + movq %rsi, %r12 + pushq %rbx + andq $-64, %rsp + subq $1152, %rsp + .cfi_offset 3, -56 + cmpq $256, %rsi + jbe .L2065 + call _ZN3hwy17GetGeneratorStateEv@PLT + movq %rsp, %rcx + movq %r12, %rdx + movq %r14, %rsi + movq %rax, %r8 + movl $50, %r9d + movq %r13, %rdi + call _ZN3hwy6N_AVX36detail7RecurseINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L2060: + leaq -40(%rbp), %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L2062: + .cfi_restore 3 + .cfi_restore 6 + .cfi_restore 12 + .cfi_restore 13 + .cfi_restore 14 + .cfi_restore 15 + ret + .p2align 4,,10 + .p2align 3 +.L2065: + .cfi_def_cfa 6, 16 + .cfi_offset 3, -56 + .cfi_offset 6, -16 + .cfi_offset 12, -48 + .cfi_offset 13, -40 + .cfi_offset 14, -32 + .cfi_offset 15, -24 + leaq 1024(%rdi), %rax + cmpq %rax, %r14 + jb .L2066 + movl $16, %esi + call _ZN3hwy6N_AVX36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + jmp .L2060 +.L2066: + xorl %eax, %eax + movq %rsp, %r14 + cmpq $15, %rsi + jbe .L2048 + leaq -16(%rsi), %rax + movq %rsp, %r14 + movq %r13, %rsi + movq %rax, %rdx + movq %r14, %rdi + andq $-16, %rax + shrq $4, %rdx + addq $16, %rax + leal 8(,%rdx,8), %ecx + andl $536870904, %ecx + rep movsq +.L2048: + movq %r12, %rdx + leaq 0(,%rax,4), %rbx + subq %rax, %rdx + movl $65535, %eax + leaq (%r14,%rbx), %r15 + addq %r13, %rbx + kmovd %eax, %k4 + cmpq $255, %rdx + ja .L2052 + movq $-1, %rax + bzhi %rdx, %rax, %rax + movzwl %ax, %eax + kmovd %eax, %k4 +.L2052: + leal -1(%r12), %eax + movl $32, %edx + movl $1, %esi + vmovdqu32 (%rbx), %zmm0{%k4}{z} + bsrl %eax, %eax + xorl $31, %eax + vmovdqa32 %zmm0, (%r15){%k4} + vpbroadcastq .LC10(%rip), %zmm0 + subl %eax, %edx + movl $1, %eax + shlx %rdx, %rsi, %rsi + shrq $4, %rsi + cmove %rax, %rsi + movq %r12, %rax + leaq 1(%rsi), %rdx + salq $4, %rdx + cmpq %rdx, %r12 + jnb .L2056 + .p2align 4,,10 + .p2align 3 +.L2053: + vmovdqu64 %zmm0, (%r14,%rax,4) + addq $16, %rax + cmpq %rax, %rdx + ja .L2053 +.L2056: + movq %r14, %rdi + vzeroupper + call _ZN3hwy6N_AVX36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + cmpq $15, %r12 + jbe .L2055 + leaq -16(%r12), %rax + movq (%rsp), %rdx + leaq 8(%r13), %rdi + movq %r14, %rsi + shrq $4, %rax + andq $-8, %rdi + addq $1, %rax + movq %rdx, 0(%r13) + salq $6, %rax + movl %eax, %edx + movq -8(%r14,%rdx), %rcx + movq %rcx, -8(%r13,%rdx) + subq %rdi, %r13 + addl %r13d, %eax + subq %r13, %rsi + shrl $3, %eax + movl %eax, %ecx + rep movsq +.L2055: + vmovdqa32 (%r15), %zmm0{%k4}{z} + vmovdqu32 %zmm0, (%rbx){%k4} + vzeroupper + jmp .L2060 + .cfi_endproc +.LFE12848: + .size _ZN3hwy6N_AVX310SortI32AscEPim, .-_ZN3hwy6N_AVX310SortI32AscEPim + .section .text._ZN3hwy11N_AVX3_ZEN410SortI32AscEPim,"ax",@progbits + .p2align 4 + .globl _ZN3hwy11N_AVX3_ZEN410SortI32AscEPim + .hidden _ZN3hwy11N_AVX3_ZEN410SortI32AscEPim + .type _ZN3hwy11N_AVX3_ZEN410SortI32AscEPim, @function +_ZN3hwy11N_AVX3_ZEN410SortI32AscEPim: +.LFB15264: + .cfi_startproc + cmpq $1, %rsi + jbe .L2086 + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + pushq %r14 + .cfi_offset 15, -24 + .cfi_offset 14, -32 + leaq (%rdi,%rsi,4), %r14 + pushq %r13 + .cfi_offset 13, -40 + movq %rdi, %r13 + pushq %r12 + 
.cfi_offset 12, -48 + movq %rsi, %r12 + pushq %rbx + andq $-64, %rsp + subq $1152, %rsp + .cfi_offset 3, -56 + cmpq $256, %rsi + jbe .L2089 + call _ZN3hwy17GetGeneratorStateEv@PLT + movq %rsp, %rcx + movq %r12, %rdx + movq %r14, %rsi + movq %rax, %r8 + movl $50, %r9d + movq %r13, %rdi + call _ZN3hwy11N_AVX3_ZEN46detail7RecurseINS0_4SimdIiLm16ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L2084: + leaq -40(%rbp), %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L2086: + .cfi_restore 3 + .cfi_restore 6 + .cfi_restore 12 + .cfi_restore 13 + .cfi_restore 14 + .cfi_restore 15 + ret + .p2align 4,,10 + .p2align 3 +.L2089: + .cfi_def_cfa 6, 16 + .cfi_offset 3, -56 + .cfi_offset 6, -16 + .cfi_offset 12, -48 + .cfi_offset 13, -40 + .cfi_offset 14, -32 + .cfi_offset 15, -24 + leaq 1024(%rdi), %rax + cmpq %rax, %r14 + jb .L2090 + movl $16, %esi + call _ZN3hwy11N_AVX3_ZEN46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + jmp .L2084 +.L2090: + xorl %eax, %eax + movq %rsp, %r14 + cmpq $15, %rsi + jbe .L2072 + leaq -16(%rsi), %rax + movq %rsp, %r14 + movq %r13, %rsi + movq %rax, %rdx + movq %r14, %rdi + andq $-16, %rax + shrq $4, %rdx + addq $16, %rax + leal 8(,%rdx,8), %ecx + andl $536870904, %ecx + rep movsq +.L2072: + movq %r12, %rdx + leaq 0(,%rax,4), %rbx + subq %rax, %rdx + movl $65535, %eax + leaq (%r14,%rbx), %r15 + addq %r13, %rbx + kmovd %eax, %k4 + cmpq $255, %rdx + ja .L2076 + movq $-1, %rax + bzhi %rdx, %rax, %rax + movzwl %ax, %eax + kmovd %eax, %k4 +.L2076: + leal -1(%r12), %eax + movl $32, %edx + movl $1, %esi + vmovdqu32 (%rbx), %zmm0{%k4}{z} + bsrl %eax, %eax + xorl $31, %eax + vmovdqa32 %zmm0, (%r15){%k4} + vpbroadcastq .LC10(%rip), %zmm0 + subl %eax, %edx + movl $1, %eax + shlx %rdx, %rsi, %rsi + shrq $4, %rsi + cmove %rax, %rsi + movq %r12, %rax + leaq 1(%rsi), %rdx + salq $4, %rdx + cmpq %rdx, %r12 + jnb .L2080 + .p2align 4,,10 + .p2align 3 +.L2077: + vmovdqu64 %zmm0, (%r14,%rax,4) + addq $16, %rax + cmpq %rax, %rdx + ja .L2077 +.L2080: + movq %r14, %rdi + vzeroupper + call _ZN3hwy11N_AVX3_ZEN46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + cmpq $15, %r12 + jbe .L2079 + leaq -16(%r12), %rax + movq (%rsp), %rdx + leaq 8(%r13), %rdi + movq %r14, %rsi + shrq $4, %rax + andq $-8, %rdi + addq $1, %rax + movq %rdx, 0(%r13) + salq $6, %rax + movl %eax, %edx + movq -8(%r14,%rdx), %rcx + movq %rcx, -8(%r13,%rdx) + subq %rdi, %r13 + addl %r13d, %eax + subq %r13, %rsi + shrl $3, %eax + movl %eax, %ecx + rep movsq +.L2079: + vmovdqa32 (%r15), %zmm0{%k4}{z} + vmovdqu32 %zmm0, (%rbx){%k4} + vzeroupper + jmp .L2084 + .cfi_endproc +.LFE15264: + .size _ZN3hwy11N_AVX3_ZEN410SortI32AscEPim, .-_ZN3hwy11N_AVX3_ZEN410SortI32AscEPim + .section .text._ZN3hwy6N_SSE210SortI32AscEPim,"ax",@progbits + .p2align 4 + .globl _ZN3hwy6N_SSE210SortI32AscEPim + .hidden _ZN3hwy6N_SSE210SortI32AscEPim + .type _ZN3hwy6N_SSE210SortI32AscEPim, @function +_ZN3hwy6N_SSE210SortI32AscEPim: +.LFB16254: + .cfi_startproc + cmpq $1, %rsi + jbe .L2113 + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r14 + .cfi_offset 14, -24 + leaq (%rdi,%rsi,4), %r14 + pushq %r13 + .cfi_offset 13, -32 + movq %rdi, %r13 + pushq %r12 + .cfi_offset 12, -40 + movq %rsi, %r12 + subq $296, %rsp + cmpq $64, %rsi + jbe .L2116 + call 
_ZN3hwy17GetGeneratorStateEv@PLT + movq %r12, %rdx + movq %r14, %rsi + movq %r13, %rdi + movq %rax, %r8 + leaq -320(%rbp), %rcx + movl $50, %r9d + call _ZN3hwy6N_SSE26detail7RecurseINS0_4SimdIiLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L2091: + addq $296, %rsp + popq %r12 + popq %r13 + popq %r14 + popq %rbp + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L2113: + .cfi_restore 6 + .cfi_restore 12 + .cfi_restore 13 + .cfi_restore 14 + ret + .p2align 4,,10 + .p2align 3 +.L2116: + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + .cfi_offset 12, -40 + .cfi_offset 13, -32 + .cfi_offset 14, -24 + leaq 256(%rdi), %rax + cmpq %rax, %r14 + jb .L2117 + movl $4, %esi + call _ZN3hwy6N_SSE26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + jmp .L2091 +.L2117: + movq %rsi, %rcx + xorl %eax, %eax + leaq -320(%rbp), %r14 + cmpq $3, %rsi + jbe .L2097 + leaq -4(%rsi), %rax + leaq -320(%rbp), %r14 + movq %r13, %rsi + movq %rax, %rdx + andq $-4, %rax + movq %r14, %rdi + shrq $2, %rdx + addq $4, %rax + leal 2(%rdx,%rdx), %ecx + andl $536870910, %ecx + rep movsq + movq %r12, %rcx + subq %rax, %rcx + je .L2103 +.L2097: + salq $2, %rax + leaq 0(,%rcx,4), %rdx + testq %rcx, %rcx + movl $4, %ecx + cmove %rcx, %rdx + leaq (%r14,%rax), %rdi + leaq 0(%r13,%rax), %rsi + call memcpy@PLT +.L2103: + leal -1(%r12), %eax + movl $32, %ecx + movl $1, %esi + bsrl %eax, %eax + xorl $31, %eax + subl %eax, %ecx + movl $1, %eax + salq %cl, %rsi + shrq $4, %rsi + cmove %rax, %rsi + movq %rsi, %rdx + salq $4, %rdx + addq $4, %rdx + cmpq %rdx, %r12 + jnb .L2102 + movdqa .LC4(%rip), %xmm0 + movq %r12, %rax + .p2align 4,,10 + .p2align 3 +.L2101: + movups %xmm0, (%r14,%rax,4) + addq $4, %rax + cmpq %rdx, %rax + jb .L2101 +.L2102: + movq %r14, %rdi + call _ZN3hwy6N_SSE26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIiEEEEEEiEEvT_PT0_m.isra.0 + xorl %eax, %eax + cmpq $3, %r12 + jbe .L2105 + leaq -4(%r12), %rdx + movq -320(%rbp), %rcx + leaq 8(%r13), %rdi + movq %rdx, %rax + andq $-8, %rdi + andq $-4, %rdx + shrq $2, %rax + movq %rcx, 0(%r13) + addq $1, %rax + salq $4, %rax + movl %eax, %ecx + movq -8(%r14,%rcx), %rsi + movq %rsi, -8(%r13,%rcx) + movq %r13, %rcx + movq %r14, %rsi + subq %rdi, %rcx + addl %ecx, %eax + subq %rcx, %rsi + shrl $3, %eax + movl %eax, %ecx + leaq 4(%rdx), %rax + rep movsq + subq %rax, %r12 + je .L2091 +.L2105: + salq $2, %rax + leaq 0(,%r12,4), %rdx + testq %r12, %r12 + movl $4, %ecx + cmove %rcx, %rdx + leaq 0(%r13,%rax), %rdi + leaq (%r14,%rax), %rsi + call memcpy@PLT + jmp .L2091 + .cfi_endproc +.LFE16254: + .size _ZN3hwy6N_SSE210SortI32AscEPim, .-_ZN3hwy6N_SSE210SortI32AscEPim + .section .text.vqsort_int32_avx2,"ax",@progbits + .p2align 4 + .globl vqsort_int32_avx2 + .hidden vqsort_int32_avx2 + .type vqsort_int32_avx2, @function +vqsort_int32_avx2: +.LFB16255: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + call _ZN3hwy6N_AVX210SortI32AscEPim + popq %rbp + .cfi_def_cfa 7, 8 + ret + .cfi_endproc +.LFE16255: + .size vqsort_int32_avx2, .-vqsort_int32_avx2 + .section .text.vqsort_int32_sse4,"ax",@progbits + .p2align 4 + .globl vqsort_int32_sse4 + .hidden vqsort_int32_sse4 + .type vqsort_int32_sse4, @function +vqsort_int32_sse4: +.LFB16256: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + call 
_ZN3hwy6N_SSE410SortI32AscEPim + popq %rbp + .cfi_def_cfa 7, 8 + ret + .cfi_endproc +.LFE16256: + .size vqsort_int32_sse4, .-vqsort_int32_sse4 + .section .text.vqsort_int32_ssse3,"ax",@progbits + .p2align 4 + .globl vqsort_int32_ssse3 + .hidden vqsort_int32_ssse3 + .type vqsort_int32_ssse3, @function +vqsort_int32_ssse3: +.LFB16257: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + call _ZN3hwy7N_SSSE310SortI32AscEPim + popq %rbp + .cfi_def_cfa 7, 8 + ret + .cfi_endproc +.LFE16257: + .size vqsort_int32_ssse3, .-vqsort_int32_ssse3 + .section .text.vqsort_int32_sse2,"ax",@progbits + .p2align 4 + .globl vqsort_int32_sse2 + .hidden vqsort_int32_sse2 + .type vqsort_int32_sse2, @function +vqsort_int32_sse2: +.LFB16258: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + call _ZN3hwy6N_SSE210SortI32AscEPim + popq %rbp + .cfi_def_cfa 7, 8 + ret + .cfi_endproc +.LFE16258: + .size vqsort_int32_sse2, .-vqsort_int32_sse2 + .hidden _ZZN3hwy6N_SSE26detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices + .weak _ZZN3hwy6N_SSE26detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices + .section .rodata._ZZN3hwy6N_SSE26detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices,"aG",@progbits,_ZZN3hwy6N_SSE26detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices,comdat + .align 16 + .type _ZZN3hwy6N_SSE26detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices, @object + .size _ZZN3hwy6N_SSE26detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices, 256 +_ZZN3hwy6N_SSE26detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices: + .string "" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\004\005\006\007\b\t\n\013\f\r\016\017" + .string "\001\002\003" + .string "\001\002\003\b\t\n\013\f\r\016\017\004\005\006\007\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007" + .string "\001\002\003\004\005\006\007\f\r\016\017\b\t\n\013\004\005\006\007\f\r\016\017" + .string "\001\002\003\b\t\n\013" + .string "\001\002\003\f\r\016\017\004\005\006\007\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\004\005\006\007\b\t\n\013" + .string "\001\002\003\f\r\016\017" + .string "\001\002\003\b\t\n\013\004\005\006\007\f\r\016\017\b\t\n\013" + .string "\001\002\003\004\005\006\007\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\004\005\006\007" + .string "\001\002\003\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017" + .ascii "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017" + .hidden _ZZN3hwy6N_AVX26detail21IndicesFromNotBits256IiLPv0EEENS0_6Vec256IjEEmE12packed_array + .weak _ZZN3hwy6N_AVX26detail21IndicesFromNotBits256IiLPv0EEENS0_6Vec256IjEEmE12packed_array + .section .rodata._ZZN3hwy6N_AVX26detail21IndicesFromNotBits256IiLPv0EEENS0_6Vec256IjEEmE12packed_array,"aG",@progbits,_ZZN3hwy6N_AVX26detail21IndicesFromNotBits256IiLPv0EEENS0_6Vec256IjEEmE12packed_array,comdat + .align 16 + .type _ZZN3hwy6N_AVX26detail21IndicesFromNotBits256IiLPv0EEENS0_6Vec256IjEEmE12packed_array, @object + .size 
_ZZN3hwy6N_AVX26detail21IndicesFromNotBits256IiLPv0EEENS0_6Vec256IjEEmE12packed_array, 1024 +_ZZN3hwy6N_AVX26detail21IndicesFromNotBits256IiLPv0EEENS0_6Vec256IjEEmE12packed_array: + .long -19088744 + .long -1880241239 + .long -1611805784 + .long -1728127814 + .long -1343370344 + .long -1459692359 + .long -1442915144 + .long -1450185269 + .long -1074935144 + .long -1191256919 + .long -1174479704 + .long -1181749814 + .long -1157702504 + .long -1164972599 + .long -1163924024 + .long -1164378404 + .long -806503784 + .long -922821719 + .long -906044504 + .long -913314374 + .long -889267304 + .long -896537159 + .long -895488584 + .long -895942949 + .long -872490344 + .long -879759959 + .long -878711384 + .long -879165734 + .long -877662824 + .long -878117159 + .long -878051624 + .long -878080019 + .long -538133864 + .long -654390359 + .long -637613144 + .long -644879174 + .long -620835944 + .long -628101959 + .long -627053384 + .long -627507509 + .long -604058984 + .long -611324759 + .long -610276184 + .long -610730294 + .long -609227624 + .long -609681719 + .long -609616184 + .long -609644564 + .long -587285864 + .long -594547799 + .long -593499224 + .long -593953094 + .long -592450664 + .long -592904519 + .long -592838984 + .long -592867349 + .long -591402344 + .long -591855959 + .long -591790424 + .long -591818774 + .long -591724904 + .long -591753239 + .long -591749144 + .long -591750914 + .long -270746984 + .long -386020439 + .long -369243224 + .long -376447814 + .long -352466024 + .long -359670599 + .long -358622024 + .long -359072309 + .long -335689064 + .long -342893399 + .long -341844824 + .long -342295094 + .long -340796264 + .long -341246519 + .long -341180984 + .long -341209124 + .long -318915944 + .long -326116439 + .long -325067864 + .long -325517894 + .long -324019304 + .long -324469319 + .long -324403784 + .long -324431909 + .long -322970984 + .long -323420759 + .long -323355224 + .long -323383334 + .long -323289704 + .long -323317799 + .long -323313704 + .long -323315459 + .long -302204264 + .long -309343319 + .long -308294744 + .long -308740934 + .long -307246184 + .long -307692359 + .long -307626824 + .long -307654709 + .long -306197864 + .long -306643799 + .long -306578264 + .long -306606134 + .long -306512744 + .long -306540599 + .long -306536504 + .long -306538244 + .long -305153384 + .long -305595479 + .long -305529944 + .long -305557574 + .long -305464424 + .long -305492039 + .long -305487944 + .long -305489669 + .long -305399144 + .long -305426519 + .long -305422424 + .long -305424134 + .long -305418344 + .long -305420039 + .long -305419784 + .long -305419889 + .long -19088744 + .long -118633559 + .long -101856344 + .long -108077894 + .long -85079144 + .long -91300679 + .long -90252104 + .long -90640949 + .long -68302184 + .long -74523479 + .long -73474904 + .long -73863734 + .long -72426344 + .long -72815159 + .long -72749624 + .long -72773924 + .long -51529064 + .long -57746519 + .long -56697944 + .long -57086534 + .long -55649384 + .long -56037959 + .long -55972424 + .long -55996709 + .long -54601064 + .long -54989399 + .long -54923864 + .long -54948134 + .long -54858344 + .long -54882599 + .long -54878504 + .long -54880019 + .long -34817384 + .long -40973399 + .long -39924824 + .long -40309574 + .long -38876264 + .long -39260999 + .long -39195464 + .long -39219509 + .long -37827944 + .long -38212439 + .long -38146904 + .long -38170934 + .long -38081384 + .long -38105399 + .long -38101304 + .long -38102804 + .long -36783464 + .long -37164119 + .long -37098584 + .long 
-37122374 + .long -37033064 + .long -37056839 + .long -37052744 + .long -37054229 + .long -36967784 + .long -36991319 + .long -36987224 + .long -36988694 + .long -36983144 + .long -36984599 + .long -36984344 + .long -36984434 + .long -19088744 + .long -24261719 + .long -23213144 + .long -23536454 + .long -22164584 + .long -22487879 + .long -22422344 + .long -22442549 + .long -21116264 + .long -21439319 + .long -21373784 + .long -21393974 + .long -21308264 + .long -21328439 + .long -21324344 + .long -21325604 + .long -20071784 + .long -20390999 + .long -20325464 + .long -20345414 + .long -20259944 + .long -20279879 + .long -20275784 + .long -20277029 + .long -20194664 + .long -20214359 + .long -20210264 + .long -20211494 + .long -20206184 + .long -20207399 + .long -20207144 + .long -20207219 + .long -19088744 + .long -19346519 + .long -19280984 + .long -19297094 + .long -19215464 + .long -19231559 + .long -19227464 + .long -19228469 + .long -19150184 + .long -19166039 + .long -19161944 + .long -19162934 + .long -19157864 + .long -19158839 + .long -19158584 + .long -19158644 + .long -19088744 + .long -19100759 + .long -19096664 + .long -19097414 + .long -19092584 + .long -19093319 + .long -19093064 + .long -19093109 + .long -19088744 + .long -19089239 + .long -19088984 + .long -19089014 + .long -19088744 + .long -19088759 + .long -19088744 + .long -19088744 + .hidden _ZZN3hwy6N_SSE46detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices + .weak _ZZN3hwy6N_SSE46detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices + .section .rodata._ZZN3hwy6N_SSE46detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices,"aG",@progbits,_ZZN3hwy6N_SSE46detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices,comdat + .align 16 + .type _ZZN3hwy6N_SSE46detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices, @object + .size _ZZN3hwy6N_SSE46detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices, 256 +_ZZN3hwy6N_SSE46detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices: + .string "" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\004\005\006\007\b\t\n\013\f\r\016\017" + .string "\001\002\003" + .string "\001\002\003\b\t\n\013\f\r\016\017\004\005\006\007\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007" + .string "\001\002\003\004\005\006\007\f\r\016\017\b\t\n\013\004\005\006\007\f\r\016\017" + .string "\001\002\003\b\t\n\013" + .string "\001\002\003\f\r\016\017\004\005\006\007\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\004\005\006\007\b\t\n\013" + .string "\001\002\003\f\r\016\017" + .string "\001\002\003\b\t\n\013\004\005\006\007\f\r\016\017\b\t\n\013" + .string "\001\002\003\004\005\006\007\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\004\005\006\007" + .string "\001\002\003\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017" + .ascii "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017" + .hidden _ZZN3hwy7N_SSSE36detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices + .weak _ZZN3hwy7N_SSSE36detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices + .section 
.rodata._ZZN3hwy7N_SSSE36detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices,"aG",@progbits,_ZZN3hwy7N_SSSE36detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices,comdat + .align 16 + .type _ZZN3hwy7N_SSSE36detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices, @object + .size _ZZN3hwy7N_SSSE36detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices, 256 +_ZZN3hwy7N_SSSE36detail21IndicesFromNotBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices: + .string "" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\004\005\006\007\b\t\n\013\f\r\016\017" + .string "\001\002\003" + .string "\001\002\003\b\t\n\013\f\r\016\017\004\005\006\007\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007" + .string "\001\002\003\004\005\006\007\f\r\016\017\b\t\n\013\004\005\006\007\f\r\016\017" + .string "\001\002\003\b\t\n\013" + .string "\001\002\003\f\r\016\017\004\005\006\007\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\004\005\006\007\b\t\n\013" + .string "\001\002\003\f\r\016\017" + .string "\001\002\003\b\t\n\013\004\005\006\007\f\r\016\017\b\t\n\013" + .string "\001\002\003\004\005\006\007\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\004\005\006\007" + .string "\001\002\003\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017" + .ascii "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017" + .hidden _ZZN3hwy6N_SSE26detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices + .weak _ZZN3hwy6N_SSE26detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices + .section .rodata._ZZN3hwy6N_SSE26detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices,"aG",@progbits,_ZZN3hwy6N_SSE26detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices,comdat + .align 16 + .type _ZZN3hwy6N_SSE26detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices, @object + .size _ZZN3hwy6N_SSE26detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices, 256 +_ZZN3hwy6N_SSE26detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices: + .string "" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\004\005\006\007" + .string "\001\002\003\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\b\t\n\013" + .string "\001\002\003\004\005\006\007\f\r\016\017" + .string "\001\002\003\b\t\n\013\004\005\006\007\f\r\016\017\004\005\006\007\b\t\n\013" + .string "\001\002\003\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013" + .string "\001\002\003\f\r\016\017\004\005\006\007\b\t\n\013\004\005\006\007\f\r\016\017" + .string "\001\002\003\b\t\n\013" + .string "\001\002\003\004\005\006\007\f\r\016\017\b\t\n\013\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007" + .string "\001\002\003\b\t\n\013\f\r\016\017\004\005\006\007\004\005\006\007\b\t\n\013\f\r\016\017" + .string "\001\002\003" + .ascii "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017" + .hidden 
_ZZN3hwy6N_AVX26detail18IndicesFromBits256IiLPv0EEENS0_6Vec256IjEEmE12packed_array + .weak _ZZN3hwy6N_AVX26detail18IndicesFromBits256IiLPv0EEENS0_6Vec256IjEEmE12packed_array + .section .rodata._ZZN3hwy6N_AVX26detail18IndicesFromBits256IiLPv0EEENS0_6Vec256IjEEmE12packed_array,"aG",@progbits,_ZZN3hwy6N_AVX26detail18IndicesFromBits256IiLPv0EEENS0_6Vec256IjEEmE12packed_array,comdat + .align 16 + .type _ZZN3hwy6N_AVX26detail18IndicesFromBits256IiLPv0EEENS0_6Vec256IjEEmE12packed_array, @object + .size _ZZN3hwy6N_AVX26detail18IndicesFromBits256IiLPv0EEENS0_6Vec256IjEEmE12packed_array, 1024 +_ZZN3hwy6N_AVX26detail18IndicesFromBits256IiLPv0EEENS0_6Vec256IjEEmE12packed_array: + .long 1985229328 + .long 1985229336 + .long 1985229321 + .long 1985229464 + .long 1985229066 + .long 1985229224 + .long 1985228969 + .long 1985231512 + .long 1985224971 + .long 1985225144 + .long 1985224889 + .long 1985227672 + .long 1985220794 + .long 1985223592 + .long 1985219497 + .long 1985264280 + .long 1985159436 + .long 1985159624 + .long 1985159369 + .long 1985162392 + .long 1985155274 + .long 1985158312 + .long 1985154217 + .long 1985202840 + .long 1985089739 + .long 1985092792 + .long 1985088697 + .long 1985137560 + .long 1985023162 + .long 1985072040 + .long 1985006505 + .long 1985788568 + .long 1984110861 + .long 1984111064 + .long 1984110809 + .long 1984114072 + .long 1984106714 + .long 1984109992 + .long 1984105897 + .long 1984158360 + .long 1984041179 + .long 1984044472 + .long 1984040377 + .long 1984093080 + .long 1983974842 + .long 1984027560 + .long 1983962025 + .long 1984805528 + .long 1982992604 + .long 1982995912 + .long 1982991817 + .long 1983044760 + .long 1982926282 + .long 1982979240 + .long 1982913705 + .long 1983761048 + .long 1981877707 + .long 1981930680 + .long 1981865145 + .long 1982712728 + .long 1980816570 + .long 1981664168 + .long 1980615593 + .long 1994177176 + .long 1967333646 + .long 1967333864 + .long 1967333609 + .long 1967337112 + .long 1967329514 + .long 1967333032 + .long 1967328937 + .long 1967385240 + .long 1967263979 + .long 1967267512 + .long 1967263417 + .long 1967319960 + .long 1967197882 + .long 1967254440 + .long 1967188905 + .long 1968093848 + .long 1966215404 + .long 1966218952 + .long 1966214857 + .long 1966271640 + .long 1966149322 + .long 1966206120 + .long 1966140585 + .long 1967049368 + .long 1965100747 + .long 1965157560 + .long 1965092025 + .long 1966001048 + .long 1964043450 + .long 1964952488 + .long 1963903913 + .long 1978448536 + .long 1949438189 + .long 1949441752 + .long 1949437657 + .long 1949494680 + .long 1949372122 + .long 1949429160 + .long 1949363625 + .long 1950276248 + .long 1948323547 + .long 1948380600 + .long 1948315065 + .long 1949227928 + .long 1947266490 + .long 1948179368 + .long 1947130793 + .long 1961736856 + .long 1931546332 + .long 1931603400 + .long 1931537865 + .long 1932450968 + .long 1930489290 + .long 1931402408 + .long 1930353833 + .long 1944963736 + .long 1913712075 + .long 1914625208 + .long 1913576633 + .long 1928186776 + .long 1896799418 + .long 1911409576 + .long 1894632361 + .long 2128394904 + .long 1698898191 + .long 1698898424 + .long 1698898169 + .long 1698901912 + .long 1698894074 + .long 1698897832 + .long 1698893737 + .long 1698953880 + .long 1698828539 + .long 1698832312 + .long 1698828217 + .long 1698888600 + .long 1698762682 + .long 1698823080 + .long 1698757545 + .long 1699723928 + .long 1697779964 + .long 1697783752 + .long 1697779657 + .long 1697840280 + .long 1697714122 + .long 1697774760 + .long 1697709225 + .long 
1698679448 + .long 1696665547 + .long 1696726200 + .long 1696660665 + .long 1697631128 + .long 1695612090 + .long 1696582568 + .long 1695533993 + .long 1711061656 + .long 1681002749 + .long 1681006552 + .long 1681002457 + .long 1681063320 + .long 1680936922 + .long 1680997800 + .long 1680932265 + .long 1681906328 + .long 1679888347 + .long 1679949240 + .long 1679883705 + .long 1680858008 + .long 1678835130 + .long 1679809448 + .long 1678760873 + .long 1694349976 + .long 1663111132 + .long 1663172040 + .long 1663106505 + .long 1664081048 + .long 1662057930 + .long 1663032488 + .long 1661983913 + .long 1677576856 + .long 1645280715 + .long 1646255288 + .long 1645206713 + .long 1660799896 + .long 1628429498 + .long 1644022696 + .long 1627245481 + .long 1876736664 + .long 1412567294 + .long 1412571112 + .long 1412567017 + .long 1412628120 + .long 1412501482 + .long 1412562600 + .long 1412497065 + .long 1413474968 + .long 1411452907 + .long 1411514040 + .long 1411448505 + .long 1412426648 + .long 1410399930 + .long 1411378088 + .long 1410329513 + .long 1425980056 + .long 1394675692 + .long 1394736840 + .long 1394671305 + .long 1395649688 + .long 1393622730 + .long 1394601128 + .long 1393552553 + .long 1409206936 + .long 1376845515 + .long 1377823928 + .long 1376775353 + .long 1392429976 + .long 1359998138 + .long 1375652776 + .long 1358875561 + .long 1609349784 + .long 1126240237 + .long 1126301400 + .long 1126235865 + .long 1127214488 + .long 1125187290 + .long 1126165928 + .long 1125117353 + .long 1140775576 + .long 1108410075 + .long 1109388728 + .long 1108340153 + .long 1123998616 + .long 1091562938 + .long 1107221416 + .long 1090444201 + .long 1340979864 + .long 839974620 + .long 840953288 + .long 839904713 + .long 855563416 + .long 823127498 + .long 838786216 + .long 822009001 + .long 1072548504 + .long 554692043 + .long 570350776 + .long 553573561 + .long 804113304 + .long 285138106 + .long 535677864 + .long 267242409 + .long -19088744 + .hidden _ZZN3hwy6N_SSE46detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices + .weak _ZZN3hwy6N_SSE46detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices + .section .rodata._ZZN3hwy6N_SSE46detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices,"aG",@progbits,_ZZN3hwy6N_SSE46detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices,comdat + .align 16 + .type _ZZN3hwy6N_SSE46detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices, @object + .size _ZZN3hwy6N_SSE46detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices, 256 +_ZZN3hwy6N_SSE46detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices: + .string "" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\004\005\006\007" + .string "\001\002\003\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\b\t\n\013" + .string "\001\002\003\004\005\006\007\f\r\016\017" + .string "\001\002\003\b\t\n\013\004\005\006\007\f\r\016\017\004\005\006\007\b\t\n\013" + .string "\001\002\003\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013" + .string "\001\002\003\f\r\016\017\004\005\006\007\b\t\n\013\004\005\006\007\f\r\016\017" + .string "\001\002\003\b\t\n\013" + .string 
"\001\002\003\004\005\006\007\f\r\016\017\b\t\n\013\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007" + .string "\001\002\003\b\t\n\013\f\r\016\017\004\005\006\007\004\005\006\007\b\t\n\013\f\r\016\017" + .string "\001\002\003" + .ascii "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017" + .hidden _ZZN3hwy7N_SSSE36detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices + .weak _ZZN3hwy7N_SSSE36detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices + .section .rodata._ZZN3hwy7N_SSSE36detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices,"aG",@progbits,_ZZN3hwy7N_SSSE36detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices,comdat + .align 16 + .type _ZZN3hwy7N_SSSE36detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices, @object + .size _ZZN3hwy7N_SSSE36detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices, 256 +_ZZN3hwy7N_SSSE36detail18IndicesFromBits128INS0_4SimdIiLm4ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices: + .string "" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\004\005\006\007" + .string "\001\002\003\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\b\t\n\013" + .string "\001\002\003\004\005\006\007\f\r\016\017" + .string "\001\002\003\b\t\n\013\004\005\006\007\f\r\016\017\004\005\006\007\b\t\n\013" + .string "\001\002\003\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013" + .string "\001\002\003\f\r\016\017\004\005\006\007\b\t\n\013\004\005\006\007\f\r\016\017" + .string "\001\002\003\b\t\n\013" + .string "\001\002\003\004\005\006\007\f\r\016\017\b\t\n\013\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007" + .string "\001\002\003\b\t\n\013\f\r\016\017\004\005\006\007\004\005\006\007\b\t\n\013\f\r\016\017" + .string "\001\002\003" + .ascii "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017" + .set .LC0,.LC3 + .section .rodata + .align 64 +.LC1: + .long 7 + .long 6 + .long 5 + .long 4 + .long 3 + .long 2 + .long 1 + .long 0 + .long 15 + .long 14 + .long 13 + .long 12 + .long 11 + .long 10 + .long 9 + .long 8 + .align 64 +.LC2: + .long 15 + .long 14 + .long 13 + .long 12 + .long 11 + .long 10 + .long 9 + .long 8 + .long 7 + .long 6 + .long 5 + .long 4 + .long 3 + .long 2 + .long 1 + .long 0 + .section .rodata.cst32,"aM",@progbits,32 + .align 32 +.LC3: + .long 0 + .long 1 + .long 2 + .long 3 + .long 4 + .long 5 + .long 6 + .long 7 + .set .LC4,.LC9 + .set .LC5,.LC8 + .set .LC6,.LC9 + .section .rodata + .align 64 +.LC8: + .long -2147483648 + .long -2147483648 + .long -2147483648 + .long -2147483648 + .long -2147483648 + .long -2147483648 + .long -2147483648 + .long -2147483648 + .long -2147483648 + .long -2147483648 + .long -2147483648 + .long -2147483648 + .long -2147483648 + .long -2147483648 + .long -2147483648 + .long -2147483648 + .align 64 +.LC9: + .long 2147483647 + .long 2147483647 + .long 2147483647 + .long 2147483647 + .long 2147483647 + .long 2147483647 + .long 2147483647 + .long 2147483647 + .long 2147483647 + .long 2147483647 + .long 2147483647 + .long 2147483647 + .long 2147483647 + .long 2147483647 + .long 2147483647 + .long 2147483647 + .set .LC10,.LC9 + .set .LC13,.LC1 + .set .LC14,.LC9 + .set .LC15,.LC8 + .set .LC16,.LC9 + 
.section .rodata.cst32 + .align 32 +.LC17: + .long 0 + .long 4 + .long 8 + .long 12 + .long 16 + .long 20 + .long 24 + .long 28 diff --git a/third_party/vqsort/vqsort_i64a.S b/third_party/vqsort/vqsort_i64a.S new file mode 100644 index 000000000..515bb7e10 --- /dev/null +++ b/third_party/vqsort/vqsort_i64a.S @@ -0,0 +1,23679 @@ + .text + .globl __popcountdi2 + .section .text._ZN3hwy6N_SSE26detail22MaybePartitionTwoValueINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_SSE26detail22MaybePartitionTwoValueINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, @function +_ZN3hwy6N_SSE26detail22MaybePartitionTwoValueINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0: +.LFB18780: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + pushq %r14 + .cfi_offset 15, -24 + .cfi_offset 14, -32 + movq %rdi, %r14 + pushq %r13 + .cfi_offset 13, -40 + movq %rcx, %r13 + pushq %r12 + .cfi_offset 12, -48 + movq %rsi, %r12 + pushq %rbx + subq $88, %rsp + .cfi_offset 3, -56 + movq %rdx, -120(%rbp) + movaps %xmm0, -80(%rbp) + movaps %xmm1, -64(%rbp) + movaps %xmm0, -96(%rbp) + movaps %xmm1, -112(%rbp) + cmpq $1, %rsi + jbe .L24 + movl $2, %r15d + xorl %ebx, %ebx + jmp .L9 + .p2align 4,,10 + .p2align 3 +.L3: + movdqa -80(%rbp), %xmm5 + movmskpd %xmm1, %edi + movups %xmm5, (%r14,%rbx,8) + call __popcountdi2@PLT + cltq + addq %rax, %rbx + leaq 2(%r15), %rax + cmpq %r12, %rax + ja .L56 + movq %rax, %r15 +.L9: + movdqu -16(%r14,%r15,8), %xmm0 + leaq -2(%r15), %rdx + leaq 0(,%rbx,8), %rax + pcmpeqd -96(%rbp), %xmm0 + pshufd $177, %xmm0, %xmm1 + pand %xmm0, %xmm1 + movdqu -16(%r14,%r15,8), %xmm0 + pcmpeqd -112(%rbp), %xmm0 + pshufd $177, %xmm0, %xmm2 + pand %xmm2, %xmm0 + movdqa %xmm1, %xmm2 + por %xmm0, %xmm2 + movmskpd %xmm2, %ecx + cmpl $3, %ecx + je .L3 + pcmpeqd %xmm2, %xmm2 + movq -120(%rbp), %rsi + leaq 2(%rbx), %rdi + pxor %xmm2, %xmm0 + pandn %xmm0, %xmm1 + movmskpd %xmm1, %ecx + rep bsfl %ecx, %ecx + movslq %ecx, %rcx + addq %rdx, %rcx + movq (%r14,%rcx,8), %xmm0 + punpcklqdq %xmm0, %xmm0 + movaps %xmm0, (%rsi) + cmpq %rdx, %rdi + ja .L4 + movq %rdx, %rcx + addq %r14, %rax + subq %rbx, %rcx + leaq -2(%rcx), %rsi + movq %rsi, %rcx + andq $-2, %rcx + addq %rbx, %rcx + leaq 16(%r14,%rcx,8), %rcx + .p2align 4,,10 + .p2align 3 +.L5: + movdqa -64(%rbp), %xmm4 + addq $16, %rax + movups %xmm4, -16(%rax) + cmpq %rcx, %rax + jne .L5 + andq $-2, %rsi + leaq (%rsi,%rdi), %rbx +.L4: + subq %rbx, %rdx + movdqa .LC1(%rip), %xmm2 + movdqa .LC0(%rip), %xmm1 + leaq 0(,%rbx,8), %rcx + movq %rdx, %xmm0 + punpcklqdq %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + psubq %xmm0, %xmm1 + pcmpeqd %xmm2, %xmm3 + pcmpgtd %xmm2, %xmm0 + pand %xmm3, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movq %xmm0, %rax + testq %rax, %rax + je .L6 + movdqa -64(%rbp), %xmm3 + movq %xmm3, (%r14,%rbx,8) +.L6: + movhlps %xmm0, %xmm3 + movq %xmm3, %rax + testq %rax, %rax + jne .L57 +.L17: + addq $88, %rsp + xorl %eax, %eax + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L57: + .cfi_restore_state + movdqa -64(%rbp), %xmm3 + movhps %xmm3, 8(%r14,%rcx) + jmp .L17 + .p2align 4,,10 + 
.p2align 3 +.L56: + movq %r12, %rcx + leaq 0(,%r15,8), %rsi + leaq 0(,%rbx,8), %r9 + subq %r15, %rcx +.L2: + testq %rcx, %rcx + je .L13 + leaq 0(,%rcx,8), %rdx + addq %r14, %rsi + movq %r13, %rdi + movq %r9, -112(%rbp) + movq %rcx, -96(%rbp) + call memcpy@PLT + movq -96(%rbp), %rcx + movq -112(%rbp), %r9 +.L13: + movdqa .LC1(%rip), %xmm3 + movq %rcx, %xmm2 + movdqa -80(%rbp), %xmm5 + movdqa .LC0(%rip), %xmm1 + punpcklqdq %xmm2, %xmm2 + movdqa %xmm3, %xmm4 + pcmpeqd %xmm2, %xmm4 + movdqa %xmm1, %xmm0 + psubq %xmm2, %xmm0 + pcmpgtd %xmm3, %xmm2 + pand %xmm4, %xmm0 + por %xmm2, %xmm0 + movdqa 0(%r13), %xmm2 + pshufd $245, %xmm0, %xmm0 + pcmpeqd %xmm2, %xmm5 + pcmpeqd -64(%rbp), %xmm2 + pshufd $177, %xmm5, %xmm4 + pand %xmm0, %xmm4 + pand %xmm5, %xmm4 + pshufd $177, %xmm2, %xmm5 + pand %xmm5, %xmm2 + pcmpeqd %xmm5, %xmm5 + movdqa %xmm5, %xmm6 + pxor %xmm0, %xmm6 + por %xmm6, %xmm2 + por %xmm4, %xmm2 + movmskpd %xmm2, %eax + cmpl $3, %eax + jne .L58 + movq %xmm0, %rax + testq %rax, %rax + je .L18 + movdqa -80(%rbp), %xmm5 + movq %xmm5, (%r14,%r9) +.L18: + movhlps %xmm0, %xmm6 + movq %xmm6, %rax + testq %rax, %rax + jne .L59 +.L19: + movmskpd %xmm4, %edi + call __popcountdi2@PLT + movdqa .LC0(%rip), %xmm1 + movdqa .LC1(%rip), %xmm3 + movslq %eax, %rdx + addq %rbx, %rdx + leaq 2(%rdx), %rax + cmpq %rax, %r12 + jb .L20 + .p2align 4,,10 + .p2align 3 +.L21: + movdqa -64(%rbp), %xmm7 + movq %rax, %rdx + movups %xmm7, -16(%r14,%rax,8) + addq $2, %rax + cmpq %rax, %r12 + jnb .L21 +.L20: + subq %rdx, %r12 + movdqa %xmm3, %xmm2 + leaq 0(,%rdx,8), %rcx + movq %r12, %xmm0 + punpcklqdq %xmm0, %xmm0 + pcmpeqd %xmm0, %xmm2 + psubq %xmm0, %xmm1 + pcmpgtd %xmm3, %xmm0 + pand %xmm2, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movq %xmm0, %rax + testq %rax, %rax + je .L22 + movdqa -64(%rbp), %xmm3 + movq %xmm3, (%r14,%rdx,8) +.L22: + movhlps %xmm0, %xmm3 + movq %xmm3, %rax + testq %rax, %rax + je .L23 + movdqa -64(%rbp), %xmm3 + movhps %xmm3, 8(%r14,%rcx) +.L23: + addq $88, %rsp + movl $1, %eax + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L59: + .cfi_restore_state + movdqa -80(%rbp), %xmm6 + movhps %xmm6, 8(%r14,%r9) + jmp .L19 +.L24: + movq %rsi, %rcx + xorl %r9d, %r9d + xorl %esi, %esi + xorl %ebx, %ebx + xorl %r15d, %r15d + jmp .L2 +.L58: + pxor %xmm5, %xmm2 + leaq 2(%rbx), %rsi + movmskpd %xmm2, %eax + rep bsfl %eax, %eax + cltq + addq %r15, %rax + movq (%r14,%rax,8), %xmm0 + movq -120(%rbp), %rax + punpcklqdq %xmm0, %xmm0 + movaps %xmm0, (%rax) + cmpq %r15, %rsi + ja .L14 + leaq -2(%r15), %rcx + leaq (%r14,%rbx,8), %rax + subq %rbx, %rcx + movq %rcx, %rdx + andq $-2, %rdx + addq %rbx, %rdx + leaq 16(%r14,%rdx,8), %rdx + .p2align 4,,10 + .p2align 3 +.L15: + movdqa -64(%rbp), %xmm4 + addq $16, %rax + movups %xmm4, -16(%rax) + cmpq %rax, %rdx + jne .L15 + andq $-2, %rcx + leaq (%rcx,%rsi), %rbx + leaq 0(,%rbx,8), %r9 +.L14: + subq %rbx, %r15 + movdqa %xmm3, %xmm2 + movq %r15, %xmm0 + punpcklqdq %xmm0, %xmm0 + pcmpeqd %xmm0, %xmm2 + psubq %xmm0, %xmm1 + pcmpgtd %xmm3, %xmm0 + pand %xmm2, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movq %xmm0, %rax + testq %rax, %rax + je .L16 + movdqa -64(%rbp), %xmm3 + movq %xmm3, (%r14,%r9) +.L16: + movhlps %xmm0, %xmm3 + movq %xmm3, %rax + testq %rax, %rax + je .L17 + movdqa -64(%rbp), %xmm3 + movhps %xmm3, 8(%r14,%r9) + jmp .L17 + .cfi_endproc +.LFE18780: + .size 
_ZN3hwy6N_SSE26detail22MaybePartitionTwoValueINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, .-_ZN3hwy6N_SSE26detail22MaybePartitionTwoValueINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + .section .text._ZN3hwy6N_SSE26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_SSE26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0, @function +_ZN3hwy6N_SSE26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0: +.LFB18781: + .cfi_startproc + cmpq %rdx, %rsi + jbe .L60 + leaq (%rdx,%rdx), %rcx + leaq 1(%rdx), %r8 + leaq 1(%rcx), %rax + addq $2, %rcx + cmpq %rax, %rsi + jbe .L60 + movq (%rdi,%rdx,8), %r11 + movq %r11, %xmm3 + punpcklqdq %xmm3, %xmm3 + movdqa %xmm3, %xmm6 + jmp .L63 + .p2align 4,,10 + .p2align 3 +.L69: + movq %rdx, %rax +.L64: + cmpq %rcx, %rsi + jbe .L65 + salq $4, %r8 + movq (%rdi,%r8), %xmm0 + punpcklqdq %xmm0, %xmm0 + movdqa %xmm0, %xmm2 + psubq %xmm0, %xmm1 + pcmpeqd %xmm4, %xmm2 + pcmpgtd %xmm4, %xmm0 + pand %xmm2, %xmm1 + por %xmm0, %xmm1 + pshufd $245, %xmm1, %xmm1 + movmskpd %xmm1, %r8d + andl $1, %r8d + jne .L66 +.L65: + cmpq %rdx, %rax + je .L60 + leaq (%rdi,%rax,8), %rdx + movq (%rdx), %rcx + movq %rcx, (%r10) + movq %r11, (%rdx) + cmpq %rax, %rsi + jbe .L72 + movq %rax, %rdx +.L67: + leaq (%rdx,%rdx), %rcx + leaq 1(%rdx), %r8 + leaq 1(%rcx), %rax + addq $2, %rcx + cmpq %rsi, %rax + jnb .L60 +.L63: + movq (%rdi,%rax,8), %xmm2 + movdqa %xmm3, %xmm0 + leaq (%rdi,%rdx,8), %r10 + movdqa %xmm6, %xmm4 + movdqa %xmm3, %xmm1 + punpcklqdq %xmm2, %xmm2 + movdqa %xmm2, %xmm5 + psubq %xmm2, %xmm0 + pcmpeqd %xmm3, %xmm5 + pand %xmm5, %xmm0 + movdqa %xmm2, %xmm5 + pcmpgtd %xmm3, %xmm5 + por %xmm5, %xmm0 + pshufd $245, %xmm0, %xmm0 + movmskpd %xmm0, %r9d + andl $1, %r9d + je .L69 + movdqa %xmm2, %xmm1 + movdqa %xmm2, %xmm4 + jmp .L64 + .p2align 4,,10 + .p2align 3 +.L66: + cmpq %rcx, %rdx + je .L60 + leaq (%rdi,%rcx,8), %rax + movq (%rax), %rdx + movq %rdx, (%r10) + movq %rcx, %rdx + movq %r11, (%rax) + jmp .L67 + .p2align 4,,10 + .p2align 3 +.L60: + ret + .p2align 4,,10 + .p2align 3 +.L72: + ret + .cfi_endproc +.LFE18781: + .size _ZN3hwy6N_SSE26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0, .-_ZN3hwy6N_SSE26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0 + .section .text._ZN3hwy6N_SSE26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_SSE26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0, @function +_ZN3hwy6N_SSE26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0: +.LFB18782: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsi, %rax + salq $3, %rax + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + .cfi_offset 15, -24 + leaq (%rdi,%rax), %r15 + pushq %r14 + .cfi_offset 14, -32 + leaq (%r15,%rax), %r14 + pushq %r13 + .cfi_offset 13, -40 + leaq (%r14,%rax), %r13 + pushq %r12 + .cfi_offset 12, -48 + leaq 0(%r13,%rax), %r12 + pushq %rbx + .cfi_offset 3, -56 + leaq 
(%r12,%rax), %rbx + leaq (%rbx,%rax), %r11 + leaq (%r11,%rax), %r10 + subq $224, %rsp + leaq (%r10,%rax), %r9 + leaq (%r9,%rax), %r8 + movq %rdi, -232(%rbp) + movq %rsi, -192(%rbp) + movdqu (%rdi), %xmm9 + leaq (%r8,%rax), %rdi + movdqu (%r15), %xmm6 + leaq (%rdi,%rax), %rsi + movdqu 0(%r13), %xmm5 + movdqu (%r14), %xmm13 + leaq (%rsi,%rax), %rcx + movdqa %xmm9, %xmm14 + movdqu (%rbx), %xmm3 + movdqu (%r12), %xmm12 + leaq (%rcx,%rax), %rdx + psubq %xmm6, %xmm14 + movdqu (%r10), %xmm2 + movdqu (%r11), %xmm11 + movdqu (%rdx), %xmm0 + movdqu (%rsi), %xmm4 + movq %rdx, -216(%rbp) + addq %rax, %rdx + movdqu (%rdx), %xmm15 + movdqu (%r8), %xmm8 + movq %rdx, -224(%rbp) + addq %rdx, %rax + movaps %xmm0, -96(%rbp) + movdqa %xmm6, %xmm0 + movdqu (%rdi), %xmm7 + movdqu (%rcx), %xmm1 + pcmpeqd %xmm9, %xmm0 + movaps %xmm15, -112(%rbp) + movdqu (%r9), %xmm10 + pand %xmm14, %xmm0 + movdqa %xmm6, %xmm14 + pcmpgtd %xmm9, %xmm14 + por %xmm14, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm14 + pandn %xmm6, %xmm14 + pand %xmm0, %xmm6 + movdqa %xmm14, %xmm15 + movdqa %xmm9, %xmm14 + pand %xmm0, %xmm14 + por %xmm15, %xmm14 + movdqa %xmm0, %xmm15 + movdqa %xmm5, %xmm0 + pcmpeqd %xmm13, %xmm0 + pandn %xmm9, %xmm15 + movdqa %xmm13, %xmm9 + psubq %xmm5, %xmm9 + por %xmm15, %xmm6 + pand %xmm9, %xmm0 + movdqa %xmm5, %xmm9 + pcmpgtd %xmm13, %xmm9 + por %xmm9, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm9 + pandn %xmm5, %xmm9 + pand %xmm0, %xmm5 + movdqa %xmm9, %xmm15 + movdqa %xmm13, %xmm9 + pand %xmm0, %xmm9 + por %xmm15, %xmm9 + movdqa %xmm0, %xmm15 + movdqa %xmm3, %xmm0 + pcmpeqd %xmm12, %xmm0 + pandn %xmm13, %xmm15 + movdqa %xmm12, %xmm13 + psubq %xmm3, %xmm13 + por %xmm15, %xmm5 + pand %xmm13, %xmm0 + movdqa %xmm3, %xmm13 + pcmpgtd %xmm12, %xmm13 + por %xmm13, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm13 + pandn %xmm3, %xmm13 + pand %xmm0, %xmm3 + movdqa %xmm13, %xmm15 + movdqa %xmm12, %xmm13 + pand %xmm0, %xmm13 + por %xmm15, %xmm13 + movdqa %xmm0, %xmm15 + movdqa %xmm2, %xmm0 + pandn %xmm12, %xmm15 + pcmpeqd %xmm11, %xmm0 + por %xmm15, %xmm3 + movaps %xmm3, -64(%rbp) + movdqa %xmm11, %xmm3 + psubq %xmm2, %xmm3 + pand %xmm3, %xmm0 + movdqa %xmm2, %xmm3 + pcmpgtd %xmm11, %xmm3 + por %xmm3, %xmm0 + movdqa %xmm11, %xmm3 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm12 + pand %xmm0, %xmm3 + pandn %xmm2, %xmm12 + pand %xmm0, %xmm2 + por %xmm12, %xmm3 + movdqa %xmm0, %xmm12 + movdqa %xmm8, %xmm0 + pcmpeqd %xmm10, %xmm0 + pandn %xmm11, %xmm12 + movdqa %xmm10, %xmm11 + psubq %xmm8, %xmm11 + por %xmm12, %xmm2 + movdqa %xmm10, %xmm12 + pand %xmm11, %xmm0 + movdqa %xmm8, %xmm11 + pcmpgtd %xmm10, %xmm11 + por %xmm11, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm11 + pand %xmm0, %xmm12 + pandn %xmm8, %xmm11 + pand %xmm0, %xmm8 + por %xmm11, %xmm12 + movdqa %xmm0, %xmm11 + movdqa %xmm4, %xmm0 + pandn %xmm10, %xmm11 + pcmpeqd %xmm7, %xmm0 + por %xmm11, %xmm8 + movaps %xmm8, -80(%rbp) + movdqa %xmm7, %xmm8 + movdqa -96(%rbp), %xmm10 + movdqa -112(%rbp), %xmm15 + psubq %xmm4, %xmm8 + pand %xmm8, %xmm0 + movdqa %xmm4, %xmm8 + pcmpgtd %xmm7, %xmm8 + por %xmm8, %xmm0 + movdqa %xmm7, %xmm8 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm11 + pand %xmm0, %xmm8 + pandn %xmm4, %xmm11 + pand %xmm0, %xmm4 + por %xmm11, %xmm8 + movdqa %xmm0, %xmm11 + movdqa %xmm10, %xmm0 + pcmpeqd %xmm1, %xmm0 + pandn %xmm7, %xmm11 + movdqa %xmm1, %xmm7 + psubq %xmm10, %xmm7 + por %xmm11, %xmm4 + movdqa %xmm1, %xmm11 + pand %xmm7, %xmm0 + movdqa %xmm10, %xmm7 + pcmpgtd %xmm1, %xmm7 + por %xmm7, %xmm0 + pshufd $245, 
%xmm0, %xmm0 + movdqa %xmm0, %xmm7 + pand %xmm0, %xmm11 + pandn %xmm10, %xmm7 + por %xmm7, %xmm11 + movdqa %xmm0, %xmm7 + pand %xmm10, %xmm0 + movdqu (%rax), %xmm10 + pandn %xmm1, %xmm7 + movdqa %xmm15, %xmm1 + por %xmm7, %xmm0 + movdqu (%rax), %xmm7 + movaps %xmm0, -96(%rbp) + psubq %xmm7, %xmm1 + movdqu (%rax), %xmm7 + movdqa %xmm1, %xmm0 + movdqu (%rax), %xmm1 + pcmpgtd %xmm15, %xmm7 + pcmpeqd %xmm15, %xmm1 + pand %xmm0, %xmm1 + movdqa %xmm15, %xmm0 + por %xmm7, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm7 + pand %xmm1, %xmm0 + pandn %xmm10, %xmm7 + por %xmm7, %xmm0 + movdqa %xmm1, %xmm7 + pand %xmm10, %xmm1 + pandn %xmm15, %xmm7 + movdqa %xmm14, %xmm10 + movdqa %xmm14, %xmm15 + por %xmm7, %xmm1 + movdqa %xmm9, %xmm7 + psubq %xmm9, %xmm10 + pcmpeqd %xmm14, %xmm7 + pand %xmm10, %xmm7 + movdqa %xmm9, %xmm10 + pcmpgtd %xmm14, %xmm10 + por %xmm10, %xmm7 + pshufd $245, %xmm7, %xmm7 + movdqa %xmm7, %xmm10 + pand %xmm7, %xmm15 + pandn %xmm9, %xmm10 + pand %xmm7, %xmm9 + por %xmm15, %xmm10 + movdqa %xmm7, %xmm15 + movdqa %xmm5, %xmm7 + pcmpeqd %xmm6, %xmm7 + pandn %xmm14, %xmm15 + movdqa %xmm6, %xmm14 + por %xmm9, %xmm15 + movdqa %xmm6, %xmm9 + psubq %xmm5, %xmm9 + pand %xmm9, %xmm7 + movdqa %xmm5, %xmm9 + pcmpgtd %xmm6, %xmm9 + por %xmm9, %xmm7 + pshufd $245, %xmm7, %xmm7 + movdqa %xmm7, %xmm9 + pand %xmm7, %xmm14 + pandn %xmm5, %xmm9 + pand %xmm7, %xmm5 + por %xmm14, %xmm9 + movdqa %xmm7, %xmm14 + pandn %xmm6, %xmm14 + movdqa %xmm3, %xmm6 + por %xmm14, %xmm5 + pcmpeqd %xmm13, %xmm6 + movdqa -64(%rbp), %xmm14 + movaps %xmm5, -112(%rbp) + movdqa %xmm13, %xmm5 + psubq %xmm3, %xmm5 + pand %xmm5, %xmm6 + movdqa %xmm3, %xmm5 + pcmpgtd %xmm13, %xmm5 + por %xmm5, %xmm6 + movdqa %xmm13, %xmm5 + pshufd $245, %xmm6, %xmm6 + movdqa %xmm6, %xmm7 + pand %xmm6, %xmm5 + pandn %xmm3, %xmm7 + pand %xmm6, %xmm3 + por %xmm7, %xmm5 + movdqa %xmm6, %xmm7 + movdqa %xmm14, %xmm6 + pandn %xmm13, %xmm7 + pcmpeqd %xmm2, %xmm6 + por %xmm7, %xmm3 + movdqa %xmm14, %xmm7 + psubq %xmm2, %xmm7 + pand %xmm7, %xmm6 + movdqa %xmm2, %xmm7 + pcmpgtd %xmm14, %xmm7 + por %xmm7, %xmm6 + movdqa %xmm14, %xmm7 + pshufd $245, %xmm6, %xmm6 + movdqa %xmm6, %xmm13 + pand %xmm6, %xmm7 + pandn %xmm2, %xmm13 + pand %xmm6, %xmm2 + por %xmm13, %xmm7 + movdqa %xmm6, %xmm13 + movdqa %xmm8, %xmm6 + pandn %xmm14, %xmm13 + pcmpeqd %xmm12, %xmm6 + movdqa %xmm12, %xmm14 + por %xmm13, %xmm2 + movdqa %xmm12, %xmm13 + psubq %xmm8, %xmm13 + pand %xmm13, %xmm6 + movdqa %xmm8, %xmm13 + pcmpgtd %xmm12, %xmm13 + por %xmm13, %xmm6 + pshufd $245, %xmm6, %xmm6 + movdqa %xmm6, %xmm13 + pand %xmm6, %xmm14 + pandn %xmm8, %xmm13 + pand %xmm6, %xmm8 + por %xmm14, %xmm13 + movdqa %xmm6, %xmm14 + movdqa %xmm8, %xmm6 + pandn %xmm12, %xmm14 + por %xmm14, %xmm6 + movdqa -80(%rbp), %xmm14 + movaps %xmm6, -128(%rbp) + movdqa %xmm14, %xmm6 + movdqa %xmm14, %xmm8 + movdqa %xmm14, %xmm12 + pcmpeqd %xmm4, %xmm6 + psubq %xmm4, %xmm8 + pand %xmm8, %xmm6 + movdqa %xmm4, %xmm8 + pcmpgtd %xmm14, %xmm8 + por %xmm8, %xmm6 + pshufd $245, %xmm6, %xmm6 + movdqa %xmm6, %xmm8 + pand %xmm6, %xmm12 + pandn %xmm4, %xmm8 + pand %xmm6, %xmm4 + por %xmm12, %xmm8 + movdqa %xmm6, %xmm12 + movdqa %xmm0, %xmm6 + pandn %xmm14, %xmm12 + pcmpeqd %xmm11, %xmm6 + movdqa -96(%rbp), %xmm14 + por %xmm12, %xmm4 + movaps %xmm4, -80(%rbp) + movdqa %xmm11, %xmm4 + psubq %xmm0, %xmm4 + pand %xmm4, %xmm6 + movdqa %xmm0, %xmm4 + pcmpgtd %xmm11, %xmm4 + por %xmm4, %xmm6 + movdqa %xmm11, %xmm4 + pshufd $245, %xmm6, %xmm6 + movdqa %xmm6, %xmm12 + pand %xmm6, %xmm4 + pandn %xmm0, %xmm12 + pand %xmm6, %xmm0 
+ por %xmm12, %xmm4 + movdqa %xmm6, %xmm12 + movdqa %xmm14, %xmm6 + pandn %xmm11, %xmm12 + movdqa %xmm14, %xmm11 + psubq %xmm1, %xmm6 + pcmpeqd %xmm1, %xmm11 + por %xmm12, %xmm0 + pand %xmm6, %xmm11 + movdqa %xmm1, %xmm6 + pcmpgtd %xmm14, %xmm6 + por %xmm6, %xmm11 + movdqa %xmm14, %xmm6 + pshufd $245, %xmm11, %xmm11 + movdqa %xmm11, %xmm12 + pand %xmm11, %xmm6 + pandn %xmm1, %xmm12 + pand %xmm11, %xmm1 + por %xmm12, %xmm6 + movdqa %xmm11, %xmm12 + movdqa %xmm5, %xmm11 + pandn %xmm14, %xmm12 + pcmpeqd %xmm10, %xmm11 + movdqa %xmm10, %xmm14 + por %xmm12, %xmm1 + movdqa %xmm10, %xmm12 + psubq %xmm5, %xmm12 + pand %xmm12, %xmm11 + movdqa %xmm5, %xmm12 + pcmpgtd %xmm10, %xmm12 + por %xmm12, %xmm11 + pshufd $245, %xmm11, %xmm11 + movdqa %xmm11, %xmm12 + pand %xmm11, %xmm14 + pandn %xmm5, %xmm12 + pand %xmm11, %xmm5 + por %xmm14, %xmm12 + movdqa %xmm11, %xmm14 + movdqa %xmm9, %xmm11 + pandn %xmm10, %xmm14 + movdqa %xmm9, %xmm10 + por %xmm5, %xmm14 + movdqa %xmm7, %xmm5 + psubq %xmm7, %xmm10 + pcmpeqd %xmm9, %xmm5 + pand %xmm10, %xmm5 + movdqa %xmm7, %xmm10 + pcmpgtd %xmm9, %xmm10 + por %xmm10, %xmm5 + pshufd $245, %xmm5, %xmm5 + movdqa %xmm5, %xmm10 + pand %xmm5, %xmm11 + pandn %xmm7, %xmm10 + pand %xmm5, %xmm7 + por %xmm11, %xmm10 + movdqa %xmm5, %xmm11 + movdqa %xmm3, %xmm5 + pandn %xmm9, %xmm11 + pcmpeqd %xmm15, %xmm5 + movdqa %xmm15, %xmm9 + por %xmm11, %xmm7 + movaps %xmm7, -96(%rbp) + movdqa %xmm15, %xmm7 + psubq %xmm3, %xmm7 + pand %xmm7, %xmm5 + movdqa %xmm3, %xmm7 + pcmpgtd %xmm15, %xmm7 + por %xmm7, %xmm5 + pshufd $245, %xmm5, %xmm5 + movdqa %xmm5, %xmm7 + pand %xmm5, %xmm9 + pandn %xmm3, %xmm7 + pand %xmm5, %xmm3 + por %xmm9, %xmm7 + movdqa %xmm5, %xmm9 + pandn %xmm15, %xmm9 + movdqa -112(%rbp), %xmm15 + por %xmm9, %xmm3 + movaps %xmm3, -144(%rbp) + movdqa %xmm15, %xmm3 + movdqa %xmm15, %xmm5 + movdqa %xmm15, %xmm9 + pcmpeqd %xmm2, %xmm3 + psubq %xmm2, %xmm5 + pand %xmm5, %xmm3 + movdqa %xmm2, %xmm5 + pcmpgtd %xmm15, %xmm5 + por %xmm5, %xmm3 + pshufd $245, %xmm3, %xmm3 + movdqa %xmm3, %xmm5 + pand %xmm3, %xmm9 + pandn %xmm2, %xmm5 + pand %xmm3, %xmm2 + por %xmm9, %xmm5 + movdqa %xmm3, %xmm9 + movdqa %xmm13, %xmm3 + pandn %xmm15, %xmm9 + psubq %xmm4, %xmm3 + movdqa -128(%rbp), %xmm15 + por %xmm9, %xmm2 + movaps %xmm2, -64(%rbp) + movdqa %xmm4, %xmm2 + pcmpeqd %xmm13, %xmm2 + pand %xmm3, %xmm2 + movdqa %xmm4, %xmm3 + pcmpgtd %xmm13, %xmm3 + por %xmm3, %xmm2 + movdqa %xmm13, %xmm3 + pshufd $245, %xmm2, %xmm2 + movdqa %xmm2, %xmm9 + pand %xmm2, %xmm3 + pandn %xmm4, %xmm9 + pand %xmm2, %xmm4 + por %xmm9, %xmm3 + movdqa %xmm2, %xmm9 + movdqa %xmm6, %xmm2 + pandn %xmm13, %xmm9 + pcmpeqd %xmm8, %xmm2 + por %xmm9, %xmm4 + movdqa %xmm8, %xmm9 + psubq %xmm6, %xmm9 + pand %xmm9, %xmm2 + movdqa %xmm6, %xmm9 + pcmpgtd %xmm8, %xmm9 + por %xmm9, %xmm2 + movdqa %xmm8, %xmm9 + pshufd $245, %xmm2, %xmm2 + movdqa %xmm2, %xmm11 + pand %xmm2, %xmm9 + pandn %xmm6, %xmm11 + pand %xmm2, %xmm6 + por %xmm11, %xmm9 + movdqa %xmm2, %xmm11 + movdqa %xmm15, %xmm2 + pandn %xmm8, %xmm11 + movdqa %xmm15, %xmm8 + psubq %xmm0, %xmm2 + pcmpeqd %xmm0, %xmm8 + por %xmm11, %xmm6 + pand %xmm2, %xmm8 + movdqa %xmm0, %xmm2 + pcmpgtd %xmm15, %xmm2 + por %xmm2, %xmm8 + movdqa %xmm15, %xmm2 + pshufd $245, %xmm8, %xmm8 + movdqa %xmm8, %xmm11 + pand %xmm8, %xmm2 + pandn %xmm0, %xmm11 + pand %xmm8, %xmm0 + por %xmm11, %xmm2 + movdqa %xmm8, %xmm11 + pandn %xmm15, %xmm11 + movdqa -80(%rbp), %xmm15 + por %xmm11, %xmm0 + movdqa %xmm15, %xmm11 + movdqa %xmm15, %xmm8 + pcmpeqd %xmm1, %xmm11 + psubq %xmm1, %xmm8 + pand %xmm8, %xmm11 + 
movdqa %xmm1, %xmm8 + pcmpgtd %xmm15, %xmm8 + por %xmm8, %xmm11 + pshufd $245, %xmm11, %xmm11 + movdqa %xmm11, %xmm8 + pandn %xmm1, %xmm8 + pand %xmm11, %xmm1 + movdqa %xmm8, %xmm13 + movdqa %xmm15, %xmm8 + pand %xmm11, %xmm8 + por %xmm13, %xmm8 + movdqa %xmm11, %xmm13 + movdqa %xmm3, %xmm11 + pcmpeqd %xmm12, %xmm11 + pandn %xmm15, %xmm13 + movdqa %xmm12, %xmm15 + psubq %xmm3, %xmm15 + por %xmm13, %xmm1 + pand %xmm15, %xmm11 + movdqa %xmm3, %xmm15 + pcmpgtd %xmm12, %xmm15 + por %xmm15, %xmm11 + pshufd $245, %xmm11, %xmm11 + movdqa %xmm11, %xmm15 + pandn %xmm3, %xmm15 + pand %xmm11, %xmm3 + movdqa %xmm15, %xmm13 + movdqa %xmm12, %xmm15 + pand %xmm11, %xmm15 + por %xmm13, %xmm15 + movaps %xmm15, -112(%rbp) + movdqa %xmm11, %xmm15 + movdqa %xmm10, %xmm11 + pandn %xmm12, %xmm15 + psubq %xmm9, %xmm11 + movdqa %xmm10, %xmm12 + movdqa %xmm15, %xmm13 + movdqa %xmm3, %xmm15 + movdqa %xmm9, %xmm3 + pcmpeqd %xmm10, %xmm3 + por %xmm13, %xmm15 + movdqa -96(%rbp), %xmm13 + pand %xmm11, %xmm3 + movdqa %xmm9, %xmm11 + pcmpgtd %xmm10, %xmm11 + por %xmm11, %xmm3 + pshufd $245, %xmm3, %xmm3 + movdqa %xmm3, %xmm11 + pand %xmm3, %xmm12 + pandn %xmm9, %xmm11 + pand %xmm3, %xmm9 + por %xmm11, %xmm12 + movdqa %xmm3, %xmm11 + movdqa %xmm7, %xmm3 + pandn %xmm10, %xmm11 + movdqa %xmm2, %xmm10 + psubq %xmm2, %xmm3 + movaps %xmm12, -80(%rbp) + pcmpeqd %xmm7, %xmm10 + por %xmm11, %xmm9 + pand %xmm3, %xmm10 + movdqa %xmm2, %xmm3 + pcmpgtd %xmm7, %xmm3 + por %xmm3, %xmm10 + movdqa %xmm7, %xmm3 + pshufd $245, %xmm10, %xmm10 + movdqa %xmm10, %xmm11 + pand %xmm10, %xmm3 + pandn %xmm2, %xmm11 + pand %xmm10, %xmm2 + por %xmm11, %xmm3 + movdqa %xmm10, %xmm11 + movdqa %xmm8, %xmm10 + pcmpeqd %xmm5, %xmm10 + pandn %xmm7, %xmm11 + movdqa %xmm5, %xmm7 + psubq %xmm8, %xmm7 + por %xmm11, %xmm2 + pand %xmm7, %xmm10 + movdqa %xmm8, %xmm7 + pcmpgtd %xmm5, %xmm7 + por %xmm7, %xmm10 + movdqa %xmm5, %xmm7 + pshufd $245, %xmm10, %xmm10 + movdqa %xmm10, %xmm11 + pand %xmm10, %xmm7 + pandn %xmm8, %xmm11 + pand %xmm10, %xmm8 + por %xmm11, %xmm7 + movdqa %xmm10, %xmm11 + movdqa %xmm4, %xmm10 + pcmpeqd %xmm14, %xmm10 + pandn %xmm5, %xmm11 + movdqa %xmm14, %xmm5 + psubq %xmm4, %xmm5 + por %xmm11, %xmm8 + pand %xmm5, %xmm10 + movdqa %xmm4, %xmm5 + pcmpgtd %xmm14, %xmm5 + por %xmm5, %xmm10 + movdqa %xmm14, %xmm5 + pshufd $245, %xmm10, %xmm10 + movdqa %xmm10, %xmm11 + pand %xmm10, %xmm5 + pandn %xmm4, %xmm11 + pand %xmm10, %xmm4 + por %xmm11, %xmm5 + movdqa %xmm10, %xmm11 + movdqa %xmm13, %xmm10 + pandn %xmm14, %xmm11 + psubq %xmm6, %xmm10 + movdqa -144(%rbp), %xmm14 + por %xmm11, %xmm4 + movdqa %xmm13, %xmm11 + pcmpeqd %xmm6, %xmm11 + pand %xmm10, %xmm11 + movdqa %xmm6, %xmm10 + pcmpgtd %xmm13, %xmm10 + por %xmm10, %xmm11 + movdqa %xmm13, %xmm10 + pshufd $245, %xmm11, %xmm11 + movdqa %xmm11, %xmm12 + pand %xmm11, %xmm10 + pandn %xmm6, %xmm12 + pand %xmm11, %xmm6 + por %xmm12, %xmm10 + movdqa %xmm11, %xmm12 + movdqa %xmm14, %xmm11 + pandn %xmm13, %xmm12 + pcmpeqd %xmm0, %xmm11 + por %xmm12, %xmm6 + movdqa %xmm14, %xmm12 + psubq %xmm0, %xmm12 + pand %xmm12, %xmm11 + movdqa %xmm0, %xmm12 + pcmpgtd %xmm14, %xmm12 + por %xmm12, %xmm11 + movdqa %xmm14, %xmm12 + pshufd $245, %xmm11, %xmm11 + movdqa %xmm11, %xmm13 + pand %xmm11, %xmm12 + pandn %xmm0, %xmm13 + pand %xmm11, %xmm0 + por %xmm13, %xmm12 + movdqa %xmm11, %xmm13 + pandn %xmm14, %xmm13 + movdqa -64(%rbp), %xmm14 + por %xmm13, %xmm0 + movdqa %xmm14, %xmm11 + movdqa %xmm14, %xmm13 + movaps %xmm0, -96(%rbp) + pcmpeqd %xmm1, %xmm11 + psubq %xmm1, %xmm13 + pand %xmm13, %xmm11 + movdqa %xmm1, %xmm13 + 
pcmpgtd %xmm14, %xmm13 + por %xmm13, %xmm11 + pshufd $245, %xmm11, %xmm11 + movdqa %xmm11, %xmm13 + movdqa %xmm11, %xmm0 + pandn -64(%rbp), %xmm0 + pandn %xmm1, %xmm13 + pand %xmm11, %xmm1 + pand %xmm11, %xmm14 + por %xmm0, %xmm1 + movdqa %xmm10, %xmm11 + por %xmm14, %xmm13 + movaps %xmm1, -160(%rbp) + movdqa %xmm2, %xmm1 + psubq %xmm2, %xmm11 + movdqa %xmm7, %xmm0 + pcmpeqd %xmm10, %xmm1 + psubq %xmm4, %xmm0 + pand %xmm11, %xmm1 + movdqa %xmm2, %xmm11 + pcmpgtd %xmm10, %xmm11 + por %xmm11, %xmm1 + movdqa %xmm10, %xmm11 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm14 + pand %xmm1, %xmm11 + pandn %xmm2, %xmm14 + pand %xmm1, %xmm2 + por %xmm14, %xmm11 + movdqa %xmm1, %xmm14 + movdqa %xmm12, %xmm1 + pandn %xmm10, %xmm14 + movdqa %xmm9, %xmm10 + psubq %xmm9, %xmm1 + pcmpeqd %xmm12, %xmm10 + por %xmm14, %xmm2 + pand %xmm1, %xmm10 + movdqa %xmm9, %xmm1 + pcmpgtd %xmm12, %xmm1 + por %xmm1, %xmm10 + movdqa %xmm12, %xmm1 + pshufd $245, %xmm10, %xmm10 + movdqa %xmm10, %xmm14 + pand %xmm10, %xmm1 + pandn %xmm9, %xmm14 + pand %xmm10, %xmm9 + por %xmm14, %xmm1 + movdqa %xmm10, %xmm14 + movdqa %xmm4, %xmm10 + pcmpeqd %xmm7, %xmm10 + pandn %xmm12, %xmm14 + por %xmm14, %xmm9 + movdqa %xmm7, %xmm14 + pand %xmm0, %xmm10 + movdqa %xmm4, %xmm0 + pcmpgtd %xmm7, %xmm0 + por %xmm0, %xmm10 + pshufd $245, %xmm10, %xmm10 + movdqa %xmm10, %xmm0 + pand %xmm10, %xmm14 + pandn %xmm4, %xmm0 + pand %xmm10, %xmm4 + por %xmm0, %xmm14 + movdqa %xmm10, %xmm0 + pandn %xmm7, %xmm0 + movdqa %xmm13, %xmm7 + por %xmm0, %xmm4 + psubq %xmm8, %xmm7 + movaps %xmm4, -64(%rbp) + movdqa %xmm8, %xmm4 + pcmpeqd %xmm13, %xmm4 + pand %xmm7, %xmm4 + movdqa %xmm8, %xmm7 + pcmpgtd %xmm13, %xmm7 + por %xmm7, %xmm4 + movdqa %xmm13, %xmm7 + pshufd $245, %xmm4, %xmm4 + movdqa %xmm4, %xmm10 + pand %xmm4, %xmm7 + pandn %xmm8, %xmm10 + pand %xmm4, %xmm8 + por %xmm10, %xmm7 + movdqa %xmm4, %xmm10 + pandn %xmm13, %xmm10 + movdqa -96(%rbp), %xmm13 + por %xmm10, %xmm8 + movdqa %xmm6, %xmm10 + movdqa %xmm13, %xmm4 + psubq %xmm13, %xmm10 + pcmpeqd %xmm6, %xmm4 + pand %xmm10, %xmm4 + movdqa %xmm13, %xmm10 + pcmpgtd %xmm6, %xmm10 + por %xmm10, %xmm4 + movdqa %xmm6, %xmm10 + pshufd $245, %xmm4, %xmm4 + movdqa %xmm4, %xmm0 + pand %xmm4, %xmm10 + pandn %xmm13, %xmm0 + por %xmm0, %xmm10 + movdqa %xmm4, %xmm0 + pandn %xmm6, %xmm0 + movdqa %xmm15, %xmm6 + movdqa %xmm0, %xmm12 + pcmpeqd %xmm5, %xmm6 + movdqa %xmm13, %xmm0 + movdqa -80(%rbp), %xmm13 + pand %xmm4, %xmm0 + movdqa %xmm5, %xmm4 + psubq %xmm15, %xmm4 + por %xmm12, %xmm0 + pand %xmm4, %xmm6 + movdqa %xmm15, %xmm4 + pcmpgtd %xmm5, %xmm4 + por %xmm4, %xmm6 + movdqa %xmm5, %xmm4 + pshufd $245, %xmm6, %xmm6 + movdqa %xmm6, %xmm12 + pand %xmm6, %xmm4 + pandn %xmm15, %xmm12 + pand %xmm6, %xmm15 + por %xmm12, %xmm4 + movdqa %xmm6, %xmm12 + movdqa %xmm13, %xmm6 + pandn %xmm5, %xmm12 + movdqa %xmm13, %xmm5 + psubq %xmm3, %xmm6 + pcmpeqd %xmm3, %xmm5 + por %xmm12, %xmm15 + pand %xmm6, %xmm5 + movdqa %xmm3, %xmm6 + pcmpgtd %xmm13, %xmm6 + por %xmm6, %xmm5 + movdqa %xmm13, %xmm6 + pshufd $245, %xmm5, %xmm5 + movdqa %xmm5, %xmm12 + pand %xmm5, %xmm6 + pandn %xmm3, %xmm12 + pand %xmm5, %xmm3 + por %xmm12, %xmm6 + movdqa %xmm5, %xmm12 + movdqa %xmm4, %xmm5 + pandn %xmm13, %xmm12 + pcmpeqd %xmm6, %xmm5 + movdqa %xmm6, %xmm13 + por %xmm12, %xmm3 + movdqa %xmm6, %xmm12 + psubq %xmm4, %xmm12 + pand %xmm12, %xmm5 + movdqa %xmm4, %xmm12 + pcmpgtd %xmm6, %xmm12 + por %xmm12, %xmm5 + pshufd $245, %xmm5, %xmm5 + movdqa %xmm5, %xmm12 + pand %xmm5, %xmm13 + pandn %xmm4, %xmm12 + pand %xmm5, %xmm4 + por %xmm12, %xmm13 + movdqa 
%xmm5, %xmm12 + movdqa %xmm7, %xmm5 + pandn %xmm6, %xmm12 + movdqa %xmm10, %xmm6 + psubq %xmm10, %xmm5 + movaps %xmm13, -128(%rbp) + pcmpeqd %xmm7, %xmm6 + por %xmm12, %xmm4 + pand %xmm5, %xmm6 + movdqa %xmm10, %xmm5 + pcmpgtd %xmm7, %xmm5 + por %xmm5, %xmm6 + movdqa %xmm7, %xmm5 + pshufd $245, %xmm6, %xmm6 + movdqa %xmm6, %xmm12 + pand %xmm6, %xmm5 + pandn %xmm10, %xmm12 + pand %xmm6, %xmm10 + por %xmm12, %xmm5 + movdqa %xmm6, %xmm12 + movdqa %xmm3, %xmm6 + pandn %xmm7, %xmm12 + movdqa %xmm15, %xmm7 + psubq %xmm15, %xmm6 + pcmpeqd %xmm3, %xmm7 + por %xmm12, %xmm10 + pand %xmm6, %xmm7 + movdqa %xmm15, %xmm6 + pcmpgtd %xmm3, %xmm6 + por %xmm6, %xmm7 + movdqa %xmm3, %xmm6 + pshufd $245, %xmm7, %xmm7 + movdqa %xmm7, %xmm12 + pand %xmm7, %xmm6 + pandn %xmm15, %xmm12 + pand %xmm7, %xmm15 + por %xmm12, %xmm6 + movdqa %xmm7, %xmm12 + movdqa %xmm0, %xmm7 + pcmpeqd %xmm8, %xmm7 + pandn %xmm3, %xmm12 + movdqa %xmm8, %xmm3 + psubq %xmm0, %xmm3 + por %xmm12, %xmm15 + pand %xmm3, %xmm7 + movdqa %xmm0, %xmm3 + pcmpgtd %xmm8, %xmm3 + por %xmm3, %xmm7 + movdqa %xmm8, %xmm3 + pshufd $245, %xmm7, %xmm7 + movdqa %xmm7, %xmm12 + pand %xmm7, %xmm3 + pandn %xmm0, %xmm12 + pand %xmm7, %xmm0 + por %xmm12, %xmm3 + movdqa %xmm7, %xmm12 + movdqa %xmm6, %xmm7 + pandn %xmm8, %xmm12 + psubq %xmm4, %xmm7 + movdqa %xmm6, %xmm8 + por %xmm12, %xmm0 + movdqa %xmm5, %xmm12 + movaps %xmm0, -176(%rbp) + movdqa %xmm4, %xmm0 + pcmpeqd %xmm6, %xmm0 + pand %xmm7, %xmm0 + movdqa %xmm4, %xmm7 + pcmpgtd %xmm6, %xmm7 + por %xmm7, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm7 + pand %xmm0, %xmm8 + pandn %xmm4, %xmm7 + pand %xmm0, %xmm4 + por %xmm7, %xmm8 + movdqa %xmm0, %xmm7 + movdqa %xmm11, %xmm0 + pandn %xmm6, %xmm7 + movdqa %xmm1, %xmm6 + psubq %xmm1, %xmm0 + movaps %xmm8, -144(%rbp) + pcmpeqd %xmm11, %xmm6 + por %xmm7, %xmm4 + movdqa %xmm9, %xmm8 + cmpq $1, -192(%rbp) + pand %xmm0, %xmm6 + movdqa %xmm1, %xmm0 + pcmpgtd %xmm11, %xmm0 + por %xmm0, %xmm6 + movdqa %xmm11, %xmm0 + pshufd $245, %xmm6, %xmm6 + movdqa %xmm6, %xmm7 + pand %xmm6, %xmm0 + pandn %xmm1, %xmm7 + pand %xmm6, %xmm1 + por %xmm7, %xmm0 + movdqa %xmm6, %xmm7 + pandn %xmm11, %xmm7 + por %xmm7, %xmm1 + movdqa %xmm9, %xmm7 + movdqa %xmm1, %xmm6 + movdqa %xmm2, %xmm1 + psubq %xmm2, %xmm7 + pcmpeqd %xmm9, %xmm1 + pand %xmm7, %xmm1 + movdqa %xmm2, %xmm7 + pcmpgtd %xmm9, %xmm7 + por %xmm7, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm7 + pand %xmm1, %xmm8 + pandn %xmm2, %xmm7 + pand %xmm1, %xmm2 + por %xmm7, %xmm8 + movdqa %xmm1, %xmm7 + movdqa %xmm10, %xmm1 + pandn %xmm9, %xmm7 + pcmpeqd %xmm3, %xmm1 + por %xmm7, %xmm2 + movdqa %xmm3, %xmm7 + psubq %xmm10, %xmm7 + pand %xmm7, %xmm1 + movdqa %xmm10, %xmm7 + pcmpgtd %xmm3, %xmm7 + por %xmm7, %xmm1 + movdqa %xmm3, %xmm7 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm9 + pand %xmm1, %xmm7 + pandn %xmm10, %xmm9 + pand %xmm1, %xmm10 + por %xmm9, %xmm7 + movdqa %xmm1, %xmm9 + movdqa %xmm14, %xmm1 + pandn %xmm3, %xmm9 + movdqa %xmm15, %xmm3 + psubq %xmm15, %xmm1 + pcmpeqd %xmm14, %xmm3 + movdqa %xmm10, %xmm13 + movdqa %xmm14, %xmm10 + por %xmm9, %xmm13 + movdqa %xmm0, %xmm9 + pand %xmm1, %xmm3 + movdqa %xmm15, %xmm1 + pcmpgtd %xmm14, %xmm1 + por %xmm1, %xmm3 + pshufd $245, %xmm3, %xmm3 + movdqa %xmm3, %xmm1 + pand %xmm3, %xmm10 + pandn %xmm15, %xmm1 + pand %xmm3, %xmm15 + por %xmm1, %xmm10 + movdqa %xmm3, %xmm1 + movdqa %xmm5, %xmm3 + pandn %xmm14, %xmm1 + movdqa -64(%rbp), %xmm14 + pcmpeqd %xmm10, %xmm9 + movdqa %xmm10, %xmm11 + por %xmm1, %xmm15 + movdqa %xmm14, %xmm1 + psubq %xmm14, %xmm3 + pcmpeqd %xmm5, 
%xmm1 + pand %xmm3, %xmm1 + movdqa %xmm14, %xmm3 + pcmpgtd %xmm5, %xmm3 + por %xmm3, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm1, %xmm12 + pandn %xmm14, %xmm3 + por %xmm3, %xmm12 + movdqa %xmm1, %xmm3 + pand %xmm14, %xmm1 + pandn %xmm5, %xmm3 + por %xmm3, %xmm1 + movdqa %xmm10, %xmm3 + psubq %xmm0, %xmm3 + pand %xmm3, %xmm9 + movdqa %xmm0, %xmm3 + pcmpgtd %xmm10, %xmm3 + por %xmm3, %xmm9 + pshufd $245, %xmm9, %xmm9 + movdqa %xmm9, %xmm3 + pand %xmm9, %xmm11 + pandn %xmm0, %xmm3 + por %xmm3, %xmm11 + movdqa %xmm9, %xmm3 + pand %xmm0, %xmm9 + pandn %xmm10, %xmm3 + movdqa %xmm6, %xmm0 + movdqa %xmm12, %xmm10 + por %xmm3, %xmm9 + movdqa %xmm15, %xmm3 + psubq %xmm15, %xmm0 + pcmpeqd %xmm6, %xmm3 + pand %xmm0, %xmm3 + movdqa %xmm15, %xmm0 + pcmpgtd %xmm6, %xmm0 + por %xmm0, %xmm3 + movdqa %xmm6, %xmm0 + pshufd $245, %xmm3, %xmm3 + movdqa %xmm3, %xmm5 + pand %xmm3, %xmm0 + pandn %xmm15, %xmm5 + por %xmm5, %xmm0 + movdqa %xmm3, %xmm5 + pand %xmm15, %xmm3 + pandn %xmm6, %xmm5 + movdqa %xmm12, %xmm6 + por %xmm5, %xmm3 + movdqa %xmm8, %xmm5 + psubq %xmm8, %xmm6 + pcmpeqd %xmm12, %xmm5 + pand %xmm6, %xmm5 + movdqa %xmm8, %xmm6 + pcmpgtd %xmm12, %xmm6 + por %xmm6, %xmm5 + pshufd $245, %xmm5, %xmm5 + movdqa %xmm5, %xmm6 + pand %xmm5, %xmm10 + pandn %xmm8, %xmm6 + pand %xmm5, %xmm8 + por %xmm6, %xmm10 + movdqa %xmm5, %xmm6 + movdqa %xmm1, %xmm5 + pandn %xmm12, %xmm6 + pcmpeqd %xmm2, %xmm5 + por %xmm6, %xmm8 + movdqa %xmm2, %xmm6 + psubq %xmm1, %xmm6 + pand %xmm6, %xmm5 + movdqa %xmm1, %xmm6 + pcmpgtd %xmm2, %xmm6 + por %xmm6, %xmm5 + movdqa %xmm2, %xmm6 + pshufd $245, %xmm5, %xmm5 + movdqa %xmm5, %xmm12 + pand %xmm5, %xmm6 + pandn %xmm1, %xmm12 + pand %xmm5, %xmm1 + por %xmm12, %xmm6 + movdqa %xmm5, %xmm12 + movdqa %xmm11, %xmm5 + pandn %xmm2, %xmm12 + movdqa %xmm4, %xmm2 + psubq %xmm4, %xmm5 + pcmpeqd %xmm11, %xmm2 + por %xmm12, %xmm1 + movdqa %xmm11, %xmm12 + pand %xmm5, %xmm2 + movdqa %xmm4, %xmm5 + pcmpgtd %xmm11, %xmm5 + por %xmm5, %xmm2 + pshufd $245, %xmm2, %xmm2 + movdqa %xmm2, %xmm5 + pand %xmm2, %xmm12 + pandn %xmm4, %xmm5 + pand %xmm2, %xmm4 + por %xmm12, %xmm5 + movdqa %xmm2, %xmm12 + movdqa %xmm0, %xmm2 + pandn %xmm11, %xmm12 + pcmpeqd %xmm9, %xmm2 + movdqa %xmm9, %xmm11 + por %xmm12, %xmm4 + movaps %xmm4, -64(%rbp) + movdqa %xmm9, %xmm4 + psubq %xmm0, %xmm4 + pand %xmm4, %xmm2 + movdqa %xmm0, %xmm4 + pcmpgtd %xmm9, %xmm4 + por %xmm4, %xmm2 + pshufd $245, %xmm2, %xmm2 + movdqa %xmm2, %xmm4 + pand %xmm2, %xmm11 + pandn %xmm0, %xmm4 + pand %xmm2, %xmm0 + por %xmm4, %xmm11 + movdqa %xmm2, %xmm4 + movdqa %xmm3, %xmm2 + pandn %xmm9, %xmm4 + pcmpeqd %xmm10, %xmm2 + movdqa %xmm10, %xmm9 + movaps %xmm11, -80(%rbp) + por %xmm4, %xmm0 + movdqa %xmm10, %xmm4 + psubq %xmm3, %xmm4 + pand %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pcmpgtd %xmm10, %xmm4 + por %xmm4, %xmm2 + pshufd $245, %xmm2, %xmm2 + movdqa %xmm2, %xmm4 + pand %xmm2, %xmm9 + pandn %xmm3, %xmm4 + pand %xmm2, %xmm3 + por %xmm4, %xmm9 + movdqa %xmm2, %xmm4 + movdqa %xmm8, %xmm2 + pandn %xmm10, %xmm4 + psubq %xmm6, %xmm2 + por %xmm4, %xmm3 + movdqa %xmm6, %xmm4 + pcmpeqd %xmm8, %xmm4 + pand %xmm2, %xmm4 + movdqa %xmm6, %xmm2 + pcmpgtd %xmm8, %xmm2 + por %xmm2, %xmm4 + movdqa %xmm8, %xmm2 + pshufd $245, %xmm4, %xmm4 + movdqa %xmm4, %xmm10 + pand %xmm4, %xmm2 + pandn %xmm6, %xmm10 + pand %xmm4, %xmm6 + por %xmm10, %xmm2 + movdqa %xmm4, %xmm10 + movdqa %xmm7, %xmm4 + pandn %xmm8, %xmm10 + movdqa %xmm1, %xmm8 + psubq %xmm1, %xmm4 + pcmpeqd %xmm7, %xmm8 + por %xmm10, %xmm6 + pand %xmm4, %xmm8 + movdqa %xmm1, %xmm4 + pcmpgtd 
%xmm7, %xmm4 + por %xmm4, %xmm8 + movdqa %xmm7, %xmm4 + pshufd $245, %xmm8, %xmm8 + movdqa %xmm8, %xmm10 + pand %xmm8, %xmm4 + pandn %xmm1, %xmm10 + pand %xmm8, %xmm1 + por %xmm10, %xmm4 + movdqa %xmm8, %xmm10 + movdqa %xmm0, %xmm8 + pandn %xmm7, %xmm10 + movdqa %xmm9, %xmm7 + psubq %xmm9, %xmm8 + pcmpeqd %xmm0, %xmm7 + por %xmm10, %xmm1 + movdqa %xmm0, %xmm10 + pand %xmm8, %xmm7 + movdqa %xmm9, %xmm8 + pcmpgtd %xmm0, %xmm8 + por %xmm8, %xmm7 + pshufd $245, %xmm7, %xmm7 + movdqa %xmm7, %xmm8 + pand %xmm7, %xmm10 + pandn %xmm9, %xmm8 + pand %xmm7, %xmm9 + por %xmm8, %xmm10 + movdqa %xmm7, %xmm8 + movdqa %xmm2, %xmm7 + pcmpeqd %xmm3, %xmm7 + pandn %xmm0, %xmm8 + movdqa %xmm3, %xmm0 + movaps %xmm10, -96(%rbp) + psubq %xmm2, %xmm0 + movdqa %xmm9, %xmm14 + por %xmm8, %xmm14 + pand %xmm0, %xmm7 + movdqa %xmm2, %xmm0 + pcmpgtd %xmm3, %xmm0 + por %xmm0, %xmm7 + movdqa %xmm3, %xmm0 + pshufd $245, %xmm7, %xmm7 + movdqa %xmm7, %xmm8 + pand %xmm7, %xmm0 + pandn %xmm2, %xmm8 + pand %xmm7, %xmm2 + por %xmm8, %xmm0 + movdqa %xmm7, %xmm8 + pandn %xmm3, %xmm8 + por %xmm8, %xmm2 + jbe .L77 + movdqa -112(%rbp), %xmm7 + pshufd $78, %xmm4, %xmm4 + pshufd $78, %xmm6, %xmm6 + pshufd $78, -160(%rbp), %xmm10 + pshufd $78, %xmm1, %xmm11 + pshufd $78, %xmm13, %xmm8 + movdqa -128(%rbp), %xmm9 + pshufd $78, -176(%rbp), %xmm15 + movdqa %xmm7, %xmm3 + movdqa %xmm7, %xmm1 + movdqa %xmm7, %xmm13 + movaps %xmm11, -176(%rbp) + pcmpeqd %xmm10, %xmm3 + psubq %xmm10, %xmm1 + movdqa %xmm9, %xmm12 + pshufd $78, %xmm2, %xmm2 + pshufd $78, %xmm0, %xmm0 + pand %xmm1, %xmm3 + movdqa %xmm10, %xmm1 + pcmpgtd %xmm7, %xmm1 + por %xmm1, %xmm3 + pshufd $245, %xmm3, %xmm3 + movdqa %xmm3, %xmm1 + pand %xmm3, %xmm13 + movaps %xmm3, -208(%rbp) + pandn %xmm10, %xmm1 + por %xmm1, %xmm13 + movdqa %xmm3, %xmm1 + movdqa -144(%rbp), %xmm3 + pandn %xmm7, %xmm1 + movdqa %xmm9, %xmm7 + pcmpeqd %xmm15, %xmm7 + movaps %xmm1, -256(%rbp) + movdqa %xmm9, %xmm1 + psubq %xmm15, %xmm1 + pand %xmm1, %xmm7 + movdqa %xmm15, %xmm1 + pcmpgtd %xmm9, %xmm1 + por %xmm1, %xmm7 + pshufd $245, %xmm7, %xmm7 + movdqa %xmm7, %xmm1 + pand %xmm7, %xmm12 + pandn %xmm15, %xmm1 + por %xmm1, %xmm12 + movdqa %xmm7, %xmm1 + pand %xmm15, %xmm7 + pandn %xmm9, %xmm1 + movdqa %xmm8, %xmm9 + movaps %xmm1, -272(%rbp) + movdqa %xmm3, %xmm1 + psubq %xmm8, %xmm1 + pcmpeqd %xmm3, %xmm8 + movaps %xmm9, -192(%rbp) + pand %xmm1, %xmm8 + movdqa %xmm9, %xmm1 + movdqa %xmm3, %xmm9 + pcmpgtd %xmm3, %xmm1 + por %xmm1, %xmm8 + pshufd $245, %xmm8, %xmm8 + movdqa %xmm8, %xmm1 + pandn -192(%rbp), %xmm1 + pand %xmm8, %xmm9 + por %xmm1, %xmm9 + movdqa %xmm5, %xmm1 + movaps %xmm9, -128(%rbp) + movdqa %xmm8, %xmm9 + psubq %xmm11, %xmm1 + pand -192(%rbp), %xmm8 + pandn %xmm3, %xmm9 + movaps %xmm9, -288(%rbp) + movdqa %xmm1, %xmm9 + movdqa %xmm11, %xmm1 + por -288(%rbp), %xmm8 + pcmpeqd %xmm5, %xmm1 + pshufd $78, %xmm8, %xmm8 + pand %xmm9, %xmm1 + movdqa %xmm11, %xmm9 + movdqa %xmm5, %xmm11 + pcmpgtd %xmm5, %xmm9 + por %xmm9, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm9 + pand %xmm1, %xmm11 + pandn -176(%rbp), %xmm9 + por %xmm9, %xmm11 + movaps %xmm11, -144(%rbp) + movdqa %xmm1, %xmm11 + pand -176(%rbp), %xmm1 + pandn %xmm5, %xmm11 + movaps %xmm11, -304(%rbp) + movdqa -64(%rbp), %xmm11 + por -304(%rbp), %xmm1 + movdqa %xmm11, %xmm5 + movdqa %xmm11, %xmm9 + pshufd $78, %xmm1, %xmm1 + pcmpeqd %xmm4, %xmm5 + psubq %xmm4, %xmm9 + pand %xmm9, %xmm5 + movdqa %xmm4, %xmm9 + pcmpgtd %xmm11, %xmm9 + por %xmm9, %xmm5 + pshufd $245, %xmm5, %xmm5 + movdqa %xmm5, %xmm9 + pandn %xmm4, %xmm9 + pand %xmm5, %xmm4 
+ movaps %xmm9, -320(%rbp) + movdqa %xmm5, %xmm9 + pand -64(%rbp), %xmm5 + por -320(%rbp), %xmm5 + pandn %xmm11, %xmm9 + movdqa -80(%rbp), %xmm11 + por %xmm9, %xmm4 + pshufd $78, %xmm5, %xmm5 + movaps %xmm4, -112(%rbp) + movdqa %xmm11, %xmm4 + movdqa %xmm11, %xmm9 + pcmpeqd %xmm6, %xmm4 + psubq %xmm6, %xmm9 + pand %xmm9, %xmm4 + movdqa %xmm6, %xmm9 + pcmpgtd %xmm11, %xmm9 + por %xmm9, %xmm4 + pshufd $245, %xmm4, %xmm4 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm9 + pandn %xmm11, %xmm3 + movdqa -96(%rbp), %xmm11 + pandn %xmm6, %xmm9 + pand %xmm4, %xmm6 + por %xmm3, %xmm6 + movaps %xmm9, -336(%rbp) + pand -80(%rbp), %xmm4 + por -336(%rbp), %xmm4 + movdqa %xmm11, %xmm3 + movaps %xmm6, -160(%rbp) + movdqa %xmm11, %xmm6 + pcmpeqd %xmm2, %xmm3 + psubq %xmm2, %xmm6 + pshufd $78, %xmm4, %xmm4 + movdqa %xmm3, %xmm9 + pand %xmm6, %xmm9 + movdqa %xmm2, %xmm6 + pcmpgtd %xmm11, %xmm6 + por %xmm6, %xmm9 + pshufd $245, %xmm9, %xmm9 + movdqa %xmm9, %xmm3 + pandn %xmm2, %xmm3 + pand %xmm9, %xmm2 + movaps %xmm3, -352(%rbp) + movdqa %xmm9, %xmm3 + pand -96(%rbp), %xmm9 + por -272(%rbp), %xmm7 + por -352(%rbp), %xmm9 + pandn %xmm11, %xmm3 + por %xmm3, %xmm2 + movdqa %xmm14, %xmm3 + pshufd $78, %xmm7, %xmm15 + psubq %xmm0, %xmm3 + pshufd $78, %xmm9, %xmm9 + movdqa %xmm3, %xmm11 + movdqa %xmm0, %xmm3 + movaps %xmm9, -80(%rbp) + pcmpeqd %xmm14, %xmm3 + movdqa %xmm3, %xmm6 + movdqa %xmm0, %xmm3 + pcmpgtd %xmm14, %xmm3 + pand %xmm11, %xmm6 + por %xmm3, %xmm6 + pshufd $245, %xmm6, %xmm6 + movdqa %xmm6, %xmm3 + pandn %xmm0, %xmm3 + pand %xmm6, %xmm0 + movdqa %xmm3, %xmm11 + movdqa %xmm6, %xmm3 + pand %xmm14, %xmm6 + pandn %xmm14, %xmm3 + por %xmm11, %xmm6 + por %xmm3, %xmm0 + pshufd $78, %xmm6, %xmm6 + movdqa -208(%rbp), %xmm3 + movdqa %xmm6, %xmm7 + movdqa %xmm0, %xmm9 + pcmpeqd %xmm13, %xmm7 + pand %xmm10, %xmm3 + movdqa %xmm13, %xmm10 + por -256(%rbp), %xmm3 + pshufd $78, %xmm3, %xmm14 + movdqa %xmm13, %xmm3 + movdqa %xmm7, %xmm11 + psubq %xmm6, %xmm3 + movdqa %xmm14, %xmm7 + pand %xmm3, %xmm11 + pcmpeqd %xmm0, %xmm7 + movdqa %xmm6, %xmm3 + pcmpgtd %xmm13, %xmm3 + por %xmm3, %xmm11 + pshufd $245, %xmm11, %xmm11 + movdqa %xmm11, %xmm3 + pand %xmm11, %xmm10 + pandn %xmm6, %xmm3 + por %xmm3, %xmm10 + movdqa %xmm11, %xmm3 + pand %xmm6, %xmm11 + pandn %xmm13, %xmm3 + movdqa -80(%rbp), %xmm13 + movaps %xmm3, -96(%rbp) + movdqa %xmm0, %xmm3 + psubq %xmm14, %xmm3 + pand %xmm3, %xmm7 + movdqa %xmm14, %xmm3 + pcmpgtd %xmm0, %xmm3 + por %xmm3, %xmm7 + pshufd $245, %xmm7, %xmm7 + movdqa %xmm7, %xmm3 + pand %xmm7, %xmm9 + movaps %xmm7, -256(%rbp) + pandn %xmm14, %xmm3 + por %xmm3, %xmm9 + movdqa %xmm7, %xmm3 + movdqa -112(%rbp), %xmm7 + pandn %xmm0, %xmm3 + movdqa %xmm13, %xmm0 + pcmpeqd %xmm12, %xmm0 + movaps %xmm3, -272(%rbp) + movdqa %xmm12, %xmm3 + psubq %xmm13, %xmm3 + pand %xmm3, %xmm0 + movdqa %xmm13, %xmm3 + movdqa %xmm12, %xmm13 + pcmpgtd %xmm12, %xmm3 + por %xmm3, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + pandn -80(%rbp), %xmm3 + pand %xmm0, %xmm13 + por %xmm3, %xmm13 + movdqa %xmm0, %xmm3 + pand -80(%rbp), %xmm0 + pandn %xmm12, %xmm3 + movdqa %xmm2, %xmm12 + movaps %xmm13, -176(%rbp) + movdqa %xmm2, %xmm13 + movaps %xmm3, -288(%rbp) + movdqa %xmm15, %xmm3 + psubq %xmm15, %xmm12 + por -288(%rbp), %xmm0 + pcmpeqd %xmm2, %xmm3 + pshufd $78, %xmm0, %xmm0 + pand %xmm12, %xmm3 + movdqa %xmm15, %xmm12 + pcmpgtd %xmm2, %xmm12 + por %xmm12, %xmm3 + pshufd $245, %xmm3, %xmm3 + movdqa %xmm3, %xmm12 + pand %xmm3, %xmm13 + pandn %xmm15, %xmm12 + por %xmm12, %xmm13 + movdqa %xmm3, %xmm12 + pand %xmm15, %xmm3 + 
movaps %xmm13, -192(%rbp) + movdqa -128(%rbp), %xmm13 + pandn %xmm2, %xmm12 + movaps %xmm12, -304(%rbp) + movdqa %xmm13, %xmm2 + psubq %xmm4, %xmm2 + movdqa %xmm2, %xmm12 + movdqa %xmm13, %xmm2 + pcmpeqd %xmm4, %xmm2 + pand %xmm12, %xmm2 + movdqa %xmm4, %xmm12 + pcmpgtd %xmm13, %xmm12 + por %xmm12, %xmm2 + pshufd $245, %xmm2, %xmm2 + movdqa %xmm2, %xmm12 + pandn %xmm4, %xmm12 + pand %xmm2, %xmm4 + movaps %xmm12, -320(%rbp) + movdqa %xmm2, %xmm12 + pand -128(%rbp), %xmm2 + por -320(%rbp), %xmm2 + pandn %xmm13, %xmm12 + movdqa -160(%rbp), %xmm13 + por %xmm12, %xmm4 + pshufd $78, %xmm2, %xmm2 + movaps %xmm4, -208(%rbp) + movdqa %xmm13, %xmm4 + movdqa %xmm13, %xmm12 + pcmpeqd %xmm8, %xmm4 + psubq %xmm8, %xmm12 + pand %xmm12, %xmm4 + movdqa %xmm8, %xmm12 + pcmpgtd %xmm13, %xmm12 + por %xmm12, %xmm4 + pshufd $245, %xmm4, %xmm4 + movdqa %xmm4, %xmm12 + pandn %xmm8, %xmm12 + pand %xmm4, %xmm8 + movaps %xmm12, -336(%rbp) + movdqa %xmm4, %xmm12 + pandn %xmm13, %xmm12 + movdqa -144(%rbp), %xmm13 + por %xmm12, %xmm8 + movaps %xmm8, -64(%rbp) + movdqa %xmm13, %xmm8 + movdqa %xmm13, %xmm12 + pcmpeqd %xmm5, %xmm8 + psubq %xmm5, %xmm12 + pand %xmm12, %xmm8 + movdqa %xmm5, %xmm12 + pcmpgtd %xmm13, %xmm12 + por %xmm12, %xmm8 + pshufd $245, %xmm8, %xmm8 + movdqa %xmm8, %xmm12 + pandn %xmm5, %xmm12 + pand %xmm8, %xmm5 + movaps %xmm12, -352(%rbp) + movdqa %xmm8, %xmm12 + pand -144(%rbp), %xmm8 + por -352(%rbp), %xmm8 + pandn %xmm13, %xmm12 + movdqa %xmm7, %xmm13 + por %xmm12, %xmm5 + movdqa %xmm7, %xmm12 + psubq %xmm1, %xmm13 + pcmpeqd %xmm1, %xmm12 + pshufd $78, %xmm8, %xmm8 + pand %xmm13, %xmm12 + movdqa %xmm1, %xmm13 + pcmpgtd -112(%rbp), %xmm13 + por %xmm13, %xmm12 + pshufd $245, %xmm12, %xmm12 + movdqa %xmm12, %xmm13 + movdqa %xmm12, %xmm7 + pandn -112(%rbp), %xmm7 + pandn %xmm1, %xmm13 + pand %xmm12, %xmm1 + por -96(%rbp), %xmm11 + movdqa -176(%rbp), %xmm15 + por %xmm7, %xmm1 + movdqa -256(%rbp), %xmm7 + por -304(%rbp), %xmm3 + pshufd $78, %xmm11, %xmm6 + pand -160(%rbp), %xmm4 + por -336(%rbp), %xmm4 + pand %xmm14, %xmm7 + movaps %xmm6, -80(%rbp) + movdqa -112(%rbp), %xmm6 + por -272(%rbp), %xmm7 + movdqa %xmm10, %xmm14 + pshufd $78, %xmm4, %xmm4 + pshufd $78, %xmm3, %xmm3 + pshufd $78, %xmm7, %xmm7 + pand %xmm12, %xmm6 + movdqa -80(%rbp), %xmm12 + movaps %xmm7, -112(%rbp) + movdqa %xmm8, %xmm7 + por %xmm13, %xmm6 + pcmpeqd %xmm10, %xmm7 + pshufd $78, %xmm6, %xmm11 + movdqa %xmm10, %xmm6 + psubq %xmm8, %xmm6 + movaps %xmm11, -96(%rbp) + movdqa %xmm15, %xmm11 + pcmpeqd %xmm2, %xmm11 + pand %xmm6, %xmm7 + movdqa %xmm8, %xmm6 + pcmpgtd %xmm10, %xmm6 + por %xmm6, %xmm7 + pshufd $245, %xmm7, %xmm7 + pand %xmm7, %xmm14 + movdqa %xmm7, %xmm6 + pandn %xmm8, %xmm6 + movdqa %xmm14, %xmm13 + por %xmm6, %xmm13 + movdqa %xmm7, %xmm6 + pand %xmm8, %xmm7 + pandn %xmm10, %xmm6 + movdqa %xmm5, %xmm10 + movaps %xmm6, -272(%rbp) + movdqa %xmm15, %xmm6 + psubq %xmm12, %xmm10 + psubq %xmm2, %xmm6 + pand %xmm6, %xmm11 + movdqa %xmm2, %xmm6 + pcmpgtd %xmm15, %xmm6 + por %xmm6, %xmm11 + pshufd $245, %xmm11, %xmm11 + movdqa %xmm11, %xmm14 + movdqa %xmm11, %xmm6 + pandn %xmm2, %xmm14 + pandn %xmm15, %xmm6 + pand %xmm11, %xmm2 + por %xmm6, %xmm2 + movdqa %xmm12, %xmm6 + movdqa %xmm9, %xmm15 + movaps %xmm14, -288(%rbp) + pcmpeqd %xmm5, %xmm6 + movaps %xmm2, -144(%rbp) + movdqa -208(%rbp), %xmm14 + pand %xmm10, %xmm6 + movdqa %xmm12, %xmm10 + movdqa %xmm5, %xmm12 + pcmpgtd %xmm5, %xmm10 + por %xmm10, %xmm6 + pshufd $245, %xmm6, %xmm6 + movdqa %xmm6, %xmm10 + pandn -80(%rbp), %xmm10 + pand %xmm6, %xmm12 + por %xmm10, %xmm12 + movdqa 
%xmm6, %xmm10 + pandn %xmm5, %xmm10 + movdqa %xmm14, %xmm5 + movaps %xmm10, -304(%rbp) + movdqa %xmm14, %xmm10 + psubq %xmm0, %xmm5 + pcmpeqd %xmm0, %xmm10 + pand %xmm5, %xmm10 + movdqa %xmm0, %xmm5 + pcmpgtd %xmm14, %xmm5 + por %xmm5, %xmm10 + pshufd $245, %xmm10, %xmm10 + movdqa %xmm10, %xmm5 + pandn %xmm0, %xmm5 + pand %xmm10, %xmm0 + movaps %xmm5, -320(%rbp) + movdqa %xmm10, %xmm5 + pandn %xmm14, %xmm5 + por %xmm5, %xmm0 + movdqa -96(%rbp), %xmm5 + psubq %xmm5, %xmm15 + movdqa %xmm15, %xmm14 + movdqa %xmm5, %xmm15 + pcmpeqd %xmm9, %xmm5 + pand %xmm14, %xmm5 + movdqa %xmm15, %xmm14 + movdqa %xmm9, %xmm15 + pcmpgtd %xmm9, %xmm14 + por %xmm14, %xmm5 + pshufd $245, %xmm5, %xmm5 + movdqa %xmm5, %xmm14 + pandn -96(%rbp), %xmm14 + pand %xmm5, %xmm15 + por %xmm14, %xmm15 + movaps %xmm15, -160(%rbp) + movdqa %xmm5, %xmm15 + pandn %xmm9, %xmm15 + movaps %xmm15, -336(%rbp) + movdqa -192(%rbp), %xmm15 + movdqa %xmm15, %xmm9 + movdqa %xmm15, %xmm14 + pcmpeqd %xmm4, %xmm9 + psubq %xmm4, %xmm14 + pand %xmm14, %xmm9 + movdqa %xmm4, %xmm14 + pcmpgtd %xmm15, %xmm14 + por %xmm14, %xmm9 + pshufd $245, %xmm9, %xmm9 + movdqa %xmm9, %xmm14 + movdqa %xmm9, %xmm2 + pandn %xmm4, %xmm14 + pandn %xmm15, %xmm2 + pand %xmm9, %xmm4 + movdqa -112(%rbp), %xmm15 + por %xmm2, %xmm4 + movaps %xmm14, -352(%rbp) + movdqa %xmm1, %xmm2 + movaps %xmm4, -128(%rbp) + movdqa %xmm15, %xmm4 + movdqa %xmm15, %xmm14 + psubq %xmm15, %xmm2 + pcmpeqd %xmm1, %xmm4 + pcmpgtd %xmm1, %xmm14 + pand %xmm2, %xmm4 + por %xmm14, %xmm4 + pshufd $245, %xmm4, %xmm4 + movdqa %xmm4, %xmm2 + pandn -112(%rbp), %xmm2 + movdqa %xmm2, %xmm14 + movdqa %xmm1, %xmm2 + pand %xmm4, %xmm2 + movdqa %xmm2, %xmm15 + movdqa %xmm4, %xmm2 + pandn %xmm1, %xmm2 + por %xmm14, %xmm15 + movaps %xmm2, -368(%rbp) + movdqa -64(%rbp), %xmm2 + movdqa %xmm2, %xmm14 + movdqa %xmm2, %xmm1 + pcmpeqd %xmm3, %xmm14 + psubq %xmm3, %xmm1 + pand %xmm1, %xmm14 + movdqa %xmm3, %xmm1 + pcmpgtd -64(%rbp), %xmm1 + por %xmm1, %xmm14 + pshufd $245, %xmm14, %xmm14 + movdqa %xmm14, %xmm1 + movdqa %xmm14, %xmm2 + pandn %xmm3, %xmm1 + pand %xmm14, %xmm3 + movaps %xmm1, -384(%rbp) + pandn -64(%rbp), %xmm2 + pand -176(%rbp), %xmm11 + por -288(%rbp), %xmm11 + pand -96(%rbp), %xmm5 + por %xmm2, %xmm3 + pand -80(%rbp), %xmm6 + por -336(%rbp), %xmm5 + pshufd $78, %xmm11, %xmm11 + movaps %xmm3, -256(%rbp) + movdqa -64(%rbp), %xmm8 + por -272(%rbp), %xmm7 + movdqa %xmm11, %xmm2 + pshufd $78, %xmm5, %xmm1 + movdqa -144(%rbp), %xmm5 + pand -208(%rbp), %xmm10 + pcmpeqd %xmm13, %xmm2 + movaps %xmm1, -80(%rbp) + movdqa %xmm13, %xmm1 + pand %xmm14, %xmm8 + psubq %xmm11, %xmm1 + movdqa %xmm13, %xmm14 + pshufd $78, %xmm7, %xmm7 + por -320(%rbp), %xmm10 + pand -112(%rbp), %xmm4 + movdqa %xmm2, %xmm3 + movdqa %xmm5, %xmm2 + por -304(%rbp), %xmm6 + pand %xmm1, %xmm3 + movdqa %xmm11, %xmm1 + psubq %xmm7, %xmm2 + pcmpgtd %xmm13, %xmm1 + pshufd $78, %xmm10, %xmm10 + pshufd $78, %xmm6, %xmm6 + pand -192(%rbp), %xmm9 + por -368(%rbp), %xmm4 + por -352(%rbp), %xmm9 + por -384(%rbp), %xmm8 + por %xmm1, %xmm3 + pshufd $78, %xmm4, %xmm4 + pshufd $245, %xmm3, %xmm3 + pshufd $78, %xmm9, %xmm9 + pshufd $78, %xmm8, %xmm8 + movdqa %xmm3, %xmm1 + pand %xmm3, %xmm14 + pandn %xmm11, %xmm1 + por %xmm1, %xmm14 + movdqa %xmm3, %xmm1 + pand %xmm11, %xmm3 + pandn %xmm13, %xmm1 + movdqa %xmm7, %xmm11 + por %xmm1, %xmm3 + pcmpgtd %xmm5, %xmm11 + movdqa %xmm7, %xmm1 + pcmpeqd %xmm5, %xmm1 + pand %xmm2, %xmm1 + movdqa %xmm5, %xmm2 + por %xmm11, %xmm1 + pshufd $245, %xmm1, %xmm1 + pand %xmm1, %xmm2 + movdqa %xmm1, %xmm13 + pandn %xmm7, 
%xmm13 + movdqa %xmm2, %xmm11 + movdqa %xmm12, %xmm2 + por %xmm13, %xmm11 + movdqa %xmm1, %xmm13 + pand %xmm7, %xmm1 + pandn %xmm5, %xmm13 + psubq %xmm10, %xmm2 + movdqa -160(%rbp), %xmm5 + por %xmm13, %xmm1 + movdqa %xmm10, %xmm13 + pcmpeqd %xmm12, %xmm13 + movdqa %xmm13, %xmm7 + movdqa %xmm12, %xmm13 + pand %xmm2, %xmm7 + movdqa %xmm10, %xmm2 + pcmpgtd %xmm12, %xmm2 + por %xmm2, %xmm7 + pshufd $245, %xmm7, %xmm7 + movdqa %xmm7, %xmm2 + pand %xmm7, %xmm13 + pandn %xmm10, %xmm2 + por %xmm13, %xmm2 + movdqa %xmm7, %xmm13 + pand %xmm10, %xmm7 + pandn %xmm12, %xmm13 + movdqa %xmm6, %xmm12 + movdqa %xmm0, %xmm10 + pcmpeqd %xmm0, %xmm12 + por %xmm7, %xmm13 + psubq %xmm6, %xmm10 + movdqa %xmm12, %xmm7 + movdqa %xmm0, %xmm12 + pand %xmm10, %xmm7 + movdqa %xmm6, %xmm10 + pcmpgtd %xmm0, %xmm10 + por %xmm10, %xmm7 + pshufd $245, %xmm7, %xmm7 + movdqa %xmm7, %xmm10 + pand %xmm7, %xmm12 + pandn %xmm6, %xmm10 + por %xmm10, %xmm12 + movdqa %xmm7, %xmm10 + pand %xmm6, %xmm7 + pandn %xmm0, %xmm10 + movdqa %xmm5, %xmm0 + movdqa %xmm5, %xmm6 + pcmpeqd %xmm9, %xmm0 + psubq %xmm9, %xmm6 + por %xmm7, %xmm10 + movdqa %xmm5, %xmm7 + pand %xmm6, %xmm0 + movdqa %xmm9, %xmm6 + pcmpgtd %xmm5, %xmm6 + por %xmm6, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm6 + pand %xmm0, %xmm7 + pandn %xmm9, %xmm6 + por %xmm6, %xmm7 + movdqa %xmm0, %xmm6 + pand %xmm9, %xmm0 + movaps %xmm7, -160(%rbp) + movdqa -128(%rbp), %xmm7 + pandn %xmm5, %xmm6 + movdqa -80(%rbp), %xmm5 + movdqa %xmm0, %xmm9 + movdqa %xmm7, %xmm0 + por %xmm6, %xmm9 + movdqa %xmm7, %xmm6 + pcmpeqd %xmm5, %xmm0 + psubq %xmm5, %xmm6 + pand %xmm6, %xmm0 + movdqa %xmm5, %xmm6 + pcmpgtd %xmm7, %xmm6 + por %xmm6, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm6 + pandn -80(%rbp), %xmm6 + pand %xmm0, %xmm7 + movdqa %xmm0, %xmm5 + pand -80(%rbp), %xmm0 + pandn -128(%rbp), %xmm5 + por %xmm6, %xmm7 + movdqa %xmm8, %xmm6 + pcmpeqd %xmm15, %xmm6 + por %xmm5, %xmm0 + movdqa %xmm15, %xmm5 + psubq %xmm8, %xmm5 + movaps %xmm0, -176(%rbp) + movdqa %xmm6, %xmm0 + movdqa %xmm15, %xmm6 + pand %xmm5, %xmm0 + movdqa %xmm8, %xmm5 + pcmpgtd %xmm15, %xmm5 + por %xmm5, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm5 + pand %xmm0, %xmm6 + pandn %xmm8, %xmm5 + por %xmm5, %xmm6 + movdqa %xmm0, %xmm5 + pand %xmm8, %xmm0 + pandn %xmm15, %xmm5 + movdqa %xmm0, %xmm8 + movdqa -256(%rbp), %xmm15 + movaps %xmm6, -192(%rbp) + por %xmm5, %xmm8 + movdqa %xmm15, %xmm0 + movdqa %xmm15, %xmm5 + movdqa %xmm15, %xmm6 + pcmpeqd %xmm4, %xmm0 + psubq %xmm4, %xmm5 + pand %xmm5, %xmm0 + movdqa %xmm4, %xmm5 + pcmpgtd %xmm15, %xmm5 + por %xmm5, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm5 + pand %xmm0, %xmm6 + pandn %xmm4, %xmm5 + por %xmm5, %xmm6 + movdqa %xmm0, %xmm5 + pand %xmm4, %xmm0 + pandn %xmm15, %xmm5 + pshufd $78, %xmm14, %xmm4 + movdqa %xmm0, %xmm15 + movaps %xmm6, -208(%rbp) + movdqa %xmm14, %xmm0 + por %xmm5, %xmm15 + movdqa %xmm14, %xmm5 + pcmpeqd %xmm4, %xmm0 + psubq %xmm4, %xmm5 + pand %xmm5, %xmm0 + movdqa %xmm4, %xmm5 + pcmpgtd %xmm14, %xmm5 + por %xmm5, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm5 + pandn %xmm4, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm0, %xmm5 + pandn %xmm14, %xmm5 + pand %xmm0, %xmm14 + pand %xmm4, %xmm0 + por %xmm5, %xmm0 + pshufd $78, %xmm3, %xmm4 + por %xmm6, %xmm14 + movapd %xmm0, %xmm5 + movdqa %xmm3, %xmm0 + movsd %xmm14, %xmm5 + pcmpeqd %xmm4, %xmm0 + movaps %xmm5, -112(%rbp) + movdqa %xmm3, %xmm5 + psubq %xmm4, %xmm5 + pand %xmm5, %xmm0 + movdqa %xmm4, %xmm5 + pcmpgtd %xmm3, %xmm5 + por %xmm5, %xmm0 + pshufd $245, %xmm0, 
%xmm0 + movdqa %xmm0, %xmm5 + pandn %xmm4, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm0, %xmm5 + pandn %xmm3, %xmm5 + pand %xmm0, %xmm3 + pand %xmm4, %xmm0 + por %xmm5, %xmm0 + por %xmm6, %xmm3 + movapd %xmm0, %xmm5 + movsd %xmm3, %xmm5 + pshufd $78, %xmm11, %xmm3 + movaps %xmm5, -128(%rbp) + movdqa %xmm11, %xmm5 + psubq %xmm3, %xmm5 + movdqa %xmm5, %xmm4 + movdqa %xmm11, %xmm5 + pcmpeqd %xmm3, %xmm5 + movdqa %xmm5, %xmm0 + movdqa %xmm3, %xmm5 + pcmpgtd %xmm11, %xmm5 + pand %xmm4, %xmm0 + por %xmm5, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm14 + movdqa %xmm0, %xmm5 + pandn %xmm11, %xmm14 + pand %xmm0, %xmm11 + pand %xmm3, %xmm0 + pandn %xmm3, %xmm5 + por %xmm14, %xmm0 + pshufd $78, %xmm1, %xmm3 + por %xmm5, %xmm11 + movapd %xmm0, %xmm5 + movsd %xmm11, %xmm5 + movaps %xmm5, -144(%rbp) + movdqa %xmm1, %xmm5 + psubq %xmm3, %xmm5 + movdqa %xmm5, %xmm4 + movdqa %xmm1, %xmm5 + pcmpeqd %xmm3, %xmm5 + movdqa %xmm5, %xmm0 + movdqa %xmm3, %xmm5 + pcmpgtd %xmm1, %xmm5 + pand %xmm4, %xmm0 + por %xmm5, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm14 + movdqa %xmm0, %xmm5 + pandn %xmm1, %xmm14 + pand %xmm0, %xmm1 + pand %xmm3, %xmm0 + pandn %xmm3, %xmm5 + por %xmm14, %xmm0 + movdqa %xmm2, %xmm3 + por %xmm5, %xmm1 + movapd %xmm0, %xmm5 + movdqa %xmm2, %xmm0 + movsd %xmm1, %xmm5 + pshufd $78, %xmm2, %xmm1 + pcmpeqd %xmm1, %xmm0 + psubq %xmm1, %xmm3 + pand %xmm3, %xmm0 + movdqa %xmm1, %xmm3 + pcmpgtd %xmm2, %xmm3 + por %xmm3, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + pandn %xmm1, %xmm3 + movdqa %xmm3, %xmm4 + movdqa %xmm0, %xmm3 + pandn %xmm2, %xmm3 + pand %xmm0, %xmm2 + pand %xmm1, %xmm0 + por %xmm3, %xmm0 + por %xmm4, %xmm2 + movdqa %xmm13, %xmm3 + movdqa -160(%rbp), %xmm4 + movapd %xmm0, %xmm1 + movsd %xmm2, %xmm1 + movdqa %xmm13, %xmm2 + movaps %xmm1, -64(%rbp) + pshufd $78, %xmm13, %xmm1 + pcmpeqd %xmm1, %xmm3 + psubq %xmm1, %xmm2 + movdqa %xmm3, %xmm0 + pand %xmm2, %xmm0 + movdqa %xmm1, %xmm2 + pcmpgtd %xmm13, %xmm2 + por %xmm2, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm2 + pandn %xmm1, %xmm3 + pandn %xmm13, %xmm2 + pand %xmm0, %xmm13 + pand %xmm1, %xmm0 + por %xmm3, %xmm13 + pshufd $78, %xmm12, %xmm1 + por %xmm2, %xmm0 + movdqa %xmm12, %xmm3 + pcmpeqd %xmm1, %xmm3 + movapd %xmm0, %xmm2 + movsd %xmm13, %xmm2 + movdqa -176(%rbp), %xmm13 + movaps %xmm2, -80(%rbp) + movdqa %xmm12, %xmm2 + psubq %xmm1, %xmm2 + movdqa %xmm3, %xmm0 + pand %xmm2, %xmm0 + movdqa %xmm1, %xmm2 + pcmpgtd %xmm12, %xmm2 + por %xmm2, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm2 + pandn %xmm1, %xmm3 + pandn %xmm12, %xmm2 + pand %xmm0, %xmm12 + pand %xmm1, %xmm0 + por %xmm3, %xmm12 + pshufd $78, %xmm10, %xmm1 + por %xmm2, %xmm0 + movdqa %xmm10, %xmm3 + pcmpeqd %xmm1, %xmm3 + movapd %xmm0, %xmm2 + movsd %xmm12, %xmm2 + movaps %xmm2, -96(%rbp) + movdqa %xmm10, %xmm2 + psubq %xmm1, %xmm2 + movdqa %xmm3, %xmm0 + pand %xmm2, %xmm0 + movdqa %xmm1, %xmm2 + pcmpgtd %xmm10, %xmm2 + por %xmm2, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm3 + pandn %xmm10, %xmm2 + pand %xmm0, %xmm10 + pand %xmm1, %xmm0 + pandn %xmm1, %xmm3 + por %xmm2, %xmm0 + pshufd $78, %xmm4, %xmm1 + por %xmm3, %xmm10 + movapd %xmm0, %xmm2 + movdqa %xmm4, %xmm0 + movsd %xmm10, %xmm2 + pcmpeqd %xmm1, %xmm0 + movapd %xmm2, %xmm14 + movdqa %xmm4, %xmm2 + psubq %xmm1, %xmm2 + pand %xmm2, %xmm0 + movdqa %xmm1, %xmm2 + pcmpgtd %xmm4, %xmm2 + por %xmm2, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm3 + pandn %xmm4, %xmm2 
+ pand %xmm0, %xmm4 + pand %xmm1, %xmm0 + pandn %xmm1, %xmm3 + por %xmm2, %xmm0 + pshufd $78, %xmm9, %xmm1 + movdqa %xmm9, %xmm2 + por %xmm3, %xmm4 + movdqa %xmm9, %xmm3 + pcmpeqd %xmm1, %xmm2 + psubq %xmm1, %xmm3 + movsd %xmm4, %xmm0 + pand %xmm3, %xmm2 + movdqa %xmm1, %xmm3 + pcmpgtd %xmm9, %xmm3 + por %xmm3, %xmm2 + pshufd $245, %xmm2, %xmm2 + movdqa %xmm2, %xmm4 + movdqa %xmm2, %xmm3 + pandn %xmm1, %xmm4 + pandn %xmm9, %xmm3 + pand %xmm2, %xmm9 + por %xmm4, %xmm9 + pand %xmm1, %xmm2 + movdqa %xmm7, %xmm4 + pshufd $78, %xmm7, %xmm1 + por %xmm3, %xmm2 + movdqa %xmm7, %xmm3 + pcmpeqd %xmm1, %xmm4 + psubq %xmm1, %xmm3 + movsd %xmm9, %xmm2 + movdqa %xmm4, %xmm6 + pand %xmm3, %xmm6 + movdqa %xmm1, %xmm3 + pcmpgtd %xmm7, %xmm3 + por %xmm3, %xmm6 + pshufd $245, %xmm6, %xmm6 + movdqa %xmm6, %xmm4 + movdqa %xmm6, %xmm3 + pandn %xmm1, %xmm4 + pandn %xmm7, %xmm3 + pand %xmm6, %xmm7 + por %xmm4, %xmm7 + pand %xmm1, %xmm6 + movdqa %xmm13, %xmm4 + pshufd $78, %xmm13, %xmm1 + por %xmm3, %xmm6 + movdqa %xmm13, %xmm3 + pcmpeqd %xmm1, %xmm4 + psubq %xmm1, %xmm3 + movsd %xmm7, %xmm6 + pand %xmm3, %xmm4 + movdqa %xmm1, %xmm3 + pcmpgtd %xmm13, %xmm3 + por %xmm3, %xmm4 + pshufd $245, %xmm4, %xmm4 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm7 + pandn %xmm13, %xmm3 + pand %xmm4, %xmm13 + pand %xmm1, %xmm4 + movdqa %xmm13, %xmm9 + pandn %xmm1, %xmm7 + por %xmm3, %xmm4 + movdqa -192(%rbp), %xmm13 + por %xmm7, %xmm9 + pshufd $78, %xmm13, %xmm3 + movdqa %xmm13, %xmm1 + movdqa %xmm13, %xmm7 + pcmpeqd %xmm3, %xmm1 + psubq %xmm3, %xmm7 + movsd %xmm9, %xmm4 + pand %xmm7, %xmm1 + movdqa %xmm3, %xmm7 + pcmpgtd %xmm13, %xmm7 + por %xmm7, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm7 + movdqa %xmm1, %xmm9 + pandn %xmm13, %xmm7 + pand %xmm1, %xmm13 + pand %xmm3, %xmm1 + pandn %xmm3, %xmm9 + por %xmm7, %xmm1 + movdqa %xmm8, %xmm3 + pshufd $78, %xmm8, %xmm7 + movdqa %xmm13, %xmm10 + movdqa -208(%rbp), %xmm13 + pcmpeqd %xmm7, %xmm3 + por %xmm9, %xmm10 + movdqa %xmm8, %xmm9 + psubq %xmm7, %xmm9 + movsd %xmm10, %xmm1 + pand %xmm9, %xmm3 + movdqa %xmm7, %xmm9 + pcmpgtd %xmm8, %xmm9 + por %xmm9, %xmm3 + pshufd $245, %xmm3, %xmm3 + movdqa %xmm3, %xmm10 + movdqa %xmm3, %xmm9 + pandn %xmm7, %xmm10 + pandn %xmm8, %xmm9 + pand %xmm3, %xmm8 + pand %xmm7, %xmm3 + por %xmm10, %xmm8 + pshufd $78, %xmm13, %xmm7 + por %xmm9, %xmm3 + movdqa %xmm13, %xmm9 + shufpd $2, %xmm3, %xmm8 + movdqa %xmm13, %xmm3 + psubq %xmm7, %xmm9 + pcmpeqd %xmm7, %xmm3 + pand %xmm9, %xmm3 + movdqa %xmm7, %xmm9 + pcmpgtd %xmm13, %xmm9 + por %xmm9, %xmm3 + pshufd $245, %xmm3, %xmm3 + movdqa %xmm3, %xmm9 + movdqa %xmm3, %xmm10 + pandn %xmm13, %xmm9 + pand %xmm3, %xmm13 + pand %xmm7, %xmm3 + por %xmm9, %xmm3 + pandn %xmm7, %xmm10 + pshufd $78, %xmm15, %xmm9 + movapd %xmm3, %xmm7 + movdqa %xmm15, %xmm3 + movdqa %xmm13, %xmm11 + pcmpeqd %xmm9, %xmm3 + por %xmm10, %xmm11 + movdqa %xmm15, %xmm10 + psubq %xmm9, %xmm10 + movsd %xmm11, %xmm7 + pand %xmm10, %xmm3 + movdqa %xmm9, %xmm10 + pcmpgtd %xmm15, %xmm10 + por %xmm10, %xmm3 + pshufd $245, %xmm3, %xmm3 + movdqa %xmm3, %xmm11 + movdqa %xmm3, %xmm10 + pandn %xmm15, %xmm10 + pandn %xmm9, %xmm11 + pand %xmm3, %xmm15 + pand %xmm9, %xmm3 + por %xmm11, %xmm15 + por %xmm10, %xmm3 + movsd %xmm15, %xmm3 +.L75: + movdqa -112(%rbp), %xmm15 + movq -232(%rbp), %rdx + movups %xmm15, (%rdx) + movdqa -128(%rbp), %xmm15 + movups %xmm15, (%r15) + movdqa -144(%rbp), %xmm15 + movups %xmm15, (%r14) + movups %xmm5, 0(%r13) + movdqa -64(%rbp), %xmm5 + movups %xmm5, (%r12) + movdqa -80(%rbp), %xmm5 + movups %xmm5, (%rbx) + movdqa 
-96(%rbp), %xmm5 + movq -224(%rbp), %rbx + movups %xmm5, (%r11) + movups %xmm14, (%r10) + movups %xmm0, (%r9) + movups %xmm2, (%r8) + movups %xmm6, (%rdi) + movups %xmm4, (%rsi) + movups %xmm1, (%rcx) + movq -216(%rbp), %rcx + movups %xmm8, (%rcx) + movups %xmm7, (%rbx) + movups %xmm3, (%rax) + addq $224, %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L77: + .cfi_restore_state + movdqa -160(%rbp), %xmm3 + movdqa -176(%rbp), %xmm7 + movdqa %xmm13, %xmm8 + jmp .L75 + .cfi_endproc +.LFE18782: + .size _ZN3hwy6N_SSE26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0, .-_ZN3hwy6N_SSE26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + .section .text._ZN3hwy11N_AVX3_ZEN46detail22MaybePartitionTwoValueINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy11N_AVX3_ZEN46detail22MaybePartitionTwoValueINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, @function +_ZN3hwy11N_AVX3_ZEN46detail22MaybePartitionTwoValueINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0: +.LFB18783: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsi, %r10 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %rbx + .cfi_offset 3, -24 + cmpq $7, %rsi + jbe .L94 + movl $8, %r8d + xorl %esi, %esi + jmp .L85 + .p2align 4,,10 + .p2align 3 +.L80: + vmovdqu64 %zmm0, (%rax) + kmovb %k0, %eax + popcntq %rax, %rax + addq %rax, %rsi + leaq 8(%r8), %rax + cmpq %r10, %rax + ja .L105 + movq %rax, %r8 +.L85: + vmovdqu64 -64(%rdi,%r8,8), %zmm3 + leaq -8(%r8), %r9 + leaq (%rdi,%rsi,8), %rax + vpcmpq $0, %zmm0, %zmm3, %k0 + vpcmpq $0, %zmm1, %zmm3, %k1 + kmovb %k0, %r11d + kmovb %k1, %ebx + korb %k1, %k0, %k1 + kortestb %k1, %k1 + jc .L80 + kmovb %r11d, %k6 + kmovb %ebx, %k5 + kxnorb %k5, %k6, %k7 + kmovb %k7, %eax + tzcntl %eax, %eax + addq %r9, %rax + vpbroadcastq (%rdi,%rax,8), %zmm0 + leaq 8(%rsi), %rax + vmovdqa64 %zmm0, (%rdx) + cmpq %r9, %rax + ja .L81 + .p2align 4,,10 + .p2align 3 +.L82: + vmovdqu64 %zmm1, -64(%rdi,%rax,8) + movq %rax, %rsi + addq $8, %rax + cmpq %rax, %r9 + jnb .L82 +.L81: + subq %rsi, %r9 + leaq (%rdi,%rsi,8), %rdx + movl $255, %eax + cmpq $255, %r9 + jbe .L106 +.L83: + kmovb %eax, %k4 + xorl %eax, %eax + vmovdqu64 %zmm1, (%rdx){%k4} +.L78: + movq -8(%rbp), %rbx + leave + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L106: + .cfi_restore_state + movq $-1, %rax + bzhi %r9, %rax, %rax + movzbl %al, %eax + jmp .L83 + .p2align 4,,10 + .p2align 3 +.L105: + movq %r10, %r11 + leaq (%rdi,%r8,8), %rbx + leaq (%rdi,%rsi,8), %r9 + movl $255, %eax + subq %r8, %r11 + kmovd %eax, %k1 + cmpq $255, %r11 + jbe .L79 +.L86: + vmovdqu64 (%rbx), %zmm2{%k1}{z} + knotb %k1, %k3 + vmovdqu64 %zmm2, (%rcx){%k1} + vmovdqa64 (%rcx), %zmm2 + vpcmpq $0, %zmm0, %zmm2, %k0 + vpcmpq $0, %zmm1, %zmm2, %k2 + kandb %k1, %k0, %k0 + korb %k2, %k0, %k2 + korb %k3, %k2, %k2 + kortestb %k2, %k2 + jnc .L107 + kmovb %k0, %edx + popcntq %rdx, %rdx + addq %rsi, %rdx + vmovdqu64 %zmm0, (%r9){%k1} + leaq 8(%rdx), %rax + cmpq %r10, %rax + ja .L91 + .p2align 4,,10 + .p2align 3 +.L92: + 
vmovdqu64 %zmm1, -64(%rdi,%rax,8) + movq %rax, %rdx + addq $8, %rax + cmpq %rax, %r10 + jnb .L92 +.L91: + subq %rdx, %r10 + leaq (%rdi,%rdx,8), %rcx + movl $255, %eax + cmpq $255, %r10 + ja .L93 + movq $-1, %rax + bzhi %r10, %rax, %rax + movzbl %al, %eax +.L93: + kmovb %eax, %k5 + movl $1, %eax + vmovdqu64 %zmm1, (%rcx){%k5} + movq -8(%rbp), %rbx + leave + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret +.L94: + .cfi_restore_state + movq %rsi, %r11 + movq %rdi, %r9 + movq %rdi, %rbx + xorl %r8d, %r8d + xorl %esi, %esi +.L79: + movq $-1, %rax + bzhi %r11, %rax, %rax + movzbl %al, %eax + kmovd %eax, %k1 + jmp .L86 +.L107: + knotb %k2, %k3 + kmovb %k3, %eax + tzcntl %eax, %eax + addq %r8, %rax + vpbroadcastq (%rdi,%rax,8), %zmm0 + leaq 8(%rsi), %rax + vmovdqa64 %zmm0, (%rdx) + cmpq %r8, %rax + ja .L88 + .p2align 4,,10 + .p2align 3 +.L89: + vmovdqu64 %zmm1, -64(%rdi,%rax,8) + movq %rax, %rsi + leaq 8(%rax), %rax + cmpq %rax, %r8 + jnb .L89 + leaq (%rdi,%rsi,8), %r9 +.L88: + subq %rsi, %r8 + movl $255, %eax + cmpq $255, %r8 + ja .L90 + movq $-1, %rax + bzhi %r8, %rax, %rax + movzbl %al, %eax +.L90: + kmovb %eax, %k6 + xorl %eax, %eax + vmovdqu64 %zmm1, (%r9){%k6} + jmp .L78 + .cfi_endproc +.LFE18783: + .size _ZN3hwy11N_AVX3_ZEN46detail22MaybePartitionTwoValueINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, .-_ZN3hwy11N_AVX3_ZEN46detail22MaybePartitionTwoValueINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + .section .text._ZN3hwy11N_AVX3_ZEN46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy11N_AVX3_ZEN46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0, @function +_ZN3hwy11N_AVX3_ZEN46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0: +.LFB18784: + .cfi_startproc + movq %rsi, %r8 + movq %rdx, %rcx + cmpq %rdx, %rsi + jbe .L118 + leaq (%rdx,%rdx), %rdx + leaq 1(%rcx), %r9 + leaq 1(%rdx), %rsi + addq $2, %rdx + cmpq %rsi, %r8 + jbe .L118 + movq (%rdi,%rcx,8), %r11 + vpbroadcastq %r11, %xmm1 + jmp .L111 + .p2align 4,,10 + .p2align 3 +.L121: + movq %rsi, %rax + cmpq %rdx, %r8 + ja .L119 +.L113: + cmpq %rcx, %rax + je .L118 + leaq (%rdi,%rax,8), %rdx + movq (%rdx), %rcx + movq %rcx, (%r10) + movq %r11, (%rdx) + cmpq %rax, %r8 + jbe .L120 + leaq (%rax,%rax), %rdx + leaq 1(%rax), %r9 + leaq 1(%rdx), %rsi + addq $2, %rdx + cmpq %r8, %rsi + jnb .L118 + movq %rax, %rcx +.L111: + vpbroadcastq (%rdi,%rsi,8), %xmm0 + leaq (%rdi,%rcx,8), %r10 + vpcmpq $6, %xmm1, %xmm0, %k0 + kmovb %k0, %eax + testb $1, %al + jne .L121 + cmpq %rdx, %r8 + jbe .L118 + salq $4, %r9 + vpbroadcastq (%rdi,%r9), %xmm0 + vpcmpq $6, %xmm1, %xmm0, %k1 + kmovb %k1, %eax + testb $1, %al + je .L118 + movq %rdx, %rax + jmp .L113 + .p2align 4,,10 + .p2align 3 +.L118: + ret + .p2align 4,,10 + .p2align 3 +.L119: + salq $4, %r9 + vpbroadcastq (%rdi,%r9), %xmm2 + vpcmpq $6, %xmm0, %xmm2, %k2 + kmovb %k2, %esi + andl $1, %esi + cmovne %rdx, %rax + jmp .L113 + .p2align 4,,10 + .p2align 3 +.L120: + ret + .cfi_endproc +.LFE18784: + .size _ZN3hwy11N_AVX3_ZEN46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0, .-_ZN3hwy11N_AVX3_ZEN46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0 + 
.section .text._ZN3hwy11N_AVX3_ZEN46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy11N_AVX3_ZEN46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0, @function +_ZN3hwy11N_AVX3_ZEN46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0: +.LFB18785: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + .cfi_offset 15, -24 + leaq 0(,%rsi,8), %r15 + leaq (%rdi,%r15), %rdx + pushq %r14 + .cfi_offset 14, -32 + leaq (%rdx,%r15), %r14 + pushq %r13 + vmovq %rdx, %xmm18 + .cfi_offset 13, -40 + leaq (%r14,%r15), %r13 + pushq %r12 + .cfi_offset 12, -48 + leaq 0(%r13,%r15), %r12 + pushq %rbx + .cfi_offset 3, -56 + leaq (%r12,%r15), %rbx + leaq (%rbx,%r15), %r11 + leaq (%r11,%r15), %r10 + andq $-64, %rsp + leaq (%r10,%r15), %r9 + movq %rsi, -8(%rsp) + leaq (%r9,%r15), %r8 + vmovdqu64 (%rdi), %zmm4 + vmovdqu64 (%r9), %zmm5 + leaq (%r8,%r15), %rdx + vpminsq (%r8), %zmm5, %zmm8 + vmovdqu64 (%r11), %zmm1 + leaq (%rdx,%r15), %rsi + vmovq %rdx, %xmm23 + vpmaxsq (%r8), %zmm5, %zmm6 + leaq (%rsi,%r15), %rcx + leaq (%rcx,%r15), %rdx + leaq (%rdx,%r15), %rax + addq %rax, %r15 + movq %r15, -16(%rsp) + vmovq %xmm18, %r15 + vpminsq (%r15), %zmm4, %zmm13 + vpmaxsq (%r15), %zmm4, %zmm7 + vmovq %xmm23, %r15 + vmovdqu64 (%r14), %zmm4 + vmovdqu64 (%r15), %zmm5 + vpminsq 0(%r13), %zmm4, %zmm14 + vpmaxsq 0(%r13), %zmm4, %zmm3 + vpminsq (%rsi), %zmm5, %zmm9 + vpmaxsq (%rsi), %zmm5, %zmm15 + vmovdqu64 (%r12), %zmm4 + vmovdqu64 (%rcx), %zmm5 + vpminsq %zmm14, %zmm13, %zmm10 + vpmaxsq %zmm14, %zmm13, %zmm13 + vpminsq (%rbx), %zmm4, %zmm12 + vpmaxsq (%rbx), %zmm4, %zmm2 + vpminsq (%rdx), %zmm5, %zmm0 + vpminsq (%r10), %zmm1, %zmm4 + vpmaxsq (%rdx), %zmm5, %zmm16 + vpmaxsq (%r10), %zmm1, %zmm1 + vmovdqu64 (%rax), %zmm5 + movq -16(%rsp), %r15 + vpminsq %zmm3, %zmm7, %zmm14 + vpmaxsq %zmm3, %zmm7, %zmm7 + vpminsq %zmm4, %zmm12, %zmm3 + vpmaxsq %zmm4, %zmm12, %zmm12 + cmpq $1, -8(%rsp) + vpminsq (%r15), %zmm5, %zmm11 + vpmaxsq (%r15), %zmm5, %zmm5 + vpminsq %zmm1, %zmm2, %zmm4 + vpmaxsq %zmm1, %zmm2, %zmm2 + vpminsq %zmm9, %zmm8, %zmm1 + vpmaxsq %zmm9, %zmm8, %zmm8 + vpminsq %zmm15, %zmm6, %zmm9 + vpmaxsq %zmm15, %zmm6, %zmm6 + vpminsq %zmm11, %zmm0, %zmm15 + vpmaxsq %zmm11, %zmm0, %zmm0 + vpminsq %zmm5, %zmm16, %zmm11 + vpmaxsq %zmm5, %zmm16, %zmm16 + vpminsq %zmm3, %zmm10, %zmm5 + vpmaxsq %zmm3, %zmm10, %zmm10 + vpminsq %zmm4, %zmm14, %zmm3 + vpmaxsq %zmm4, %zmm14, %zmm14 + vpminsq %zmm12, %zmm13, %zmm4 + vpmaxsq %zmm12, %zmm13, %zmm13 + vpminsq %zmm2, %zmm7, %zmm12 + vpmaxsq %zmm2, %zmm7, %zmm7 + vpminsq %zmm15, %zmm1, %zmm2 + vpmaxsq %zmm15, %zmm1, %zmm1 + vpminsq %zmm11, %zmm9, %zmm15 + vpmaxsq %zmm11, %zmm9, %zmm9 + vpminsq %zmm0, %zmm8, %zmm11 + vpmaxsq %zmm0, %zmm8, %zmm8 + vpminsq %zmm16, %zmm6, %zmm0 + vpmaxsq %zmm16, %zmm6, %zmm6 + vpminsq %zmm2, %zmm5, %zmm17 + vpmaxsq %zmm2, %zmm5, %zmm5 + vpminsq %zmm15, %zmm3, %zmm2 + vpmaxsq %zmm15, %zmm3, %zmm3 + vpminsq %zmm11, %zmm4, %zmm15 + vpmaxsq %zmm11, %zmm4, %zmm4 + vpminsq %zmm0, %zmm12, %zmm11 + vpmaxsq %zmm0, %zmm12, %zmm12 + vpminsq %zmm1, %zmm10, %zmm0 + vpmaxsq %zmm1, %zmm10, %zmm10 + vpminsq %zmm9, %zmm14, %zmm1 + vpmaxsq %zmm9, %zmm14, %zmm14 + vpminsq %zmm8, %zmm13, %zmm9 + vpmaxsq %zmm8, %zmm13, %zmm13 + vpminsq %zmm6, %zmm7, %zmm8 + vpmaxsq %zmm6, %zmm7, %zmm7 + vpminsq %zmm4, 
%zmm1, %zmm6 + vpmaxsq %zmm4, %zmm1, %zmm1 + vpminsq %zmm3, %zmm9, %zmm4 + vpmaxsq %zmm3, %zmm9, %zmm9 + vpminsq %zmm10, %zmm11, %zmm3 + vpmaxsq %zmm10, %zmm11, %zmm11 + vpminsq %zmm12, %zmm8, %zmm10 + vpmaxsq %zmm12, %zmm8, %zmm8 + vpminsq %zmm13, %zmm14, %zmm12 + vpmaxsq %zmm13, %zmm14, %zmm14 + vpminsq %zmm5, %zmm0, %zmm13 + vpmaxsq %zmm5, %zmm0, %zmm0 + vpminsq %zmm15, %zmm2, %zmm5 + vpmaxsq %zmm15, %zmm2, %zmm2 + vpminsq %zmm13, %zmm5, %zmm16 + vpmaxsq %zmm13, %zmm5, %zmm5 + vpminsq %zmm12, %zmm10, %zmm13 + vpmaxsq %zmm12, %zmm10, %zmm10 + vpminsq %zmm0, %zmm2, %zmm12 + vpmaxsq %zmm0, %zmm2, %zmm2 + vpminsq %zmm14, %zmm8, %zmm0 + vpminsq %zmm5, %zmm12, %zmm15 + vpmaxsq %zmm5, %zmm12, %zmm12 + vpminsq %zmm4, %zmm6, %zmm5 + vpmaxsq %zmm4, %zmm6, %zmm6 + vpminsq %zmm1, %zmm9, %zmm4 + vpmaxsq %zmm1, %zmm9, %zmm9 + vpminsq %zmm10, %zmm0, %zmm1 + vpmaxsq %zmm10, %zmm0, %zmm0 + vpminsq %zmm2, %zmm3, %zmm10 + vpmaxsq %zmm2, %zmm3, %zmm3 + vpminsq %zmm11, %zmm13, %zmm2 + vpmaxsq %zmm11, %zmm13, %zmm13 + vpminsq %zmm5, %zmm10, %zmm11 + vpmaxsq %zmm5, %zmm10, %zmm10 + vpminsq %zmm3, %zmm6, %zmm5 + vpmaxsq %zmm3, %zmm6, %zmm6 + vpminsq %zmm4, %zmm2, %zmm3 + vpmaxsq %zmm4, %zmm2, %zmm2 + vpminsq %zmm13, %zmm9, %zmm4 + vpmaxsq %zmm13, %zmm9, %zmm9 + vpminsq %zmm5, %zmm10, %zmm13 + vpmaxsq %zmm5, %zmm10, %zmm10 + vpminsq %zmm6, %zmm3, %zmm5 + vpmaxsq %zmm6, %zmm3, %zmm3 + vpminsq %zmm4, %zmm2, %zmm6 + vpmaxsq %zmm14, %zmm8, %zmm8 + vpmaxsq %zmm4, %zmm2, %zmm2 + vpminsq %zmm12, %zmm11, %zmm14 + vpminsq %zmm9, %zmm1, %zmm4 + vpmaxsq %zmm12, %zmm11, %zmm11 + vpmaxsq %zmm9, %zmm1, %zmm1 + vpminsq %zmm5, %zmm10, %zmm12 + vpmaxsq %zmm5, %zmm10, %zmm10 + vpminsq %zmm6, %zmm3, %zmm5 + vpmaxsq %zmm6, %zmm3, %zmm3 + jbe .L123 + vpshufd $78, %zmm0, %zmm6 + vpshufd $78, %zmm7, %zmm7 + vpshufd $78, %zmm5, %zmm5 + movl $85, %r15d + vpshufd $78, %zmm3, %zmm3 + vpshufd $78, %zmm2, %zmm2 + vpshufd $78, %zmm4, %zmm4 + kmovb %r15d, %k1 + vpshufd $78, %zmm1, %zmm1 + vpshufd $78, %zmm8, %zmm8 + vpminsq %zmm7, %zmm17, %zmm0 + cmpq $3, -8(%rsp) + vpmaxsq %zmm7, %zmm17, %zmm9 + vpminsq %zmm8, %zmm16, %zmm17 + vpminsq %zmm2, %zmm13, %zmm7 + vpmaxsq %zmm8, %zmm16, %zmm8 + vpshufd $78, %zmm9, %zmm9 + vpminsq %zmm6, %zmm15, %zmm16 + vpmaxsq %zmm6, %zmm15, %zmm6 + vpshufd $78, %zmm8, %zmm8 + vpminsq %zmm1, %zmm14, %zmm15 + vpmaxsq %zmm1, %zmm14, %zmm1 + vpshufd $78, %zmm7, %zmm7 + vpminsq %zmm4, %zmm11, %zmm14 + vpmaxsq %zmm4, %zmm11, %zmm4 + vpshufd $78, %zmm1, %zmm1 + vpminsq %zmm3, %zmm12, %zmm11 + vpmaxsq %zmm3, %zmm12, %zmm3 + vpshufd $78, %zmm14, %zmm14 + vpminsq %zmm5, %zmm10, %zmm12 + vpmaxsq %zmm5, %zmm10, %zmm5 + vpshufd $78, %zmm11, %zmm10 + vpshufd $78, %zmm12, %zmm12 + vpmaxsq %zmm2, %zmm13, %zmm2 + vpmaxsq %zmm9, %zmm5, %zmm19 + vpminsq %zmm12, %zmm0, %zmm11 + vpmaxsq %zmm12, %zmm0, %zmm13 + vpshufd $78, %zmm6, %zmm6 + vpminsq %zmm9, %zmm5, %zmm0 + vpminsq %zmm14, %zmm15, %zmm12 + vpminsq %zmm8, %zmm3, %zmm9 + vpminsq %zmm10, %zmm17, %zmm5 + vpshufd $78, %zmm12, %zmm12 + vpmaxsq %zmm10, %zmm17, %zmm10 + vpmaxsq %zmm8, %zmm3, %zmm17 + vpminsq %zmm7, %zmm16, %zmm3 + vpminsq %zmm6, %zmm2, %zmm8 + vpshufd $78, %zmm10, %zmm10 + vpmaxsq %zmm6, %zmm2, %zmm6 + vpmaxsq %zmm14, %zmm15, %zmm14 + vpshufd $78, %zmm3, %zmm2 + vpminsq %zmm1, %zmm4, %zmm15 + vpshufd $78, %zmm17, %zmm3 + vpmaxsq %zmm1, %zmm4, %zmm4 + vpmaxsq %zmm12, %zmm11, %zmm17 + vpmaxsq %zmm7, %zmm16, %zmm7 + vpshufd $78, %zmm8, %zmm1 + vpshufd $78, %zmm13, %zmm16 + vpshufd $78, %zmm15, %zmm8 + vpminsq %zmm12, %zmm11, %zmm13 + vpshufd $78, %zmm19, %zmm15 + 
vpminsq %zmm2, %zmm5, %zmm11 + vpminsq %zmm16, %zmm14, %zmm12 + vpminsq %zmm8, %zmm0, %zmm19 + vpmaxsq %zmm16, %zmm14, %zmm14 + vpshufd $78, %zmm11, %zmm11 + vpmaxsq %zmm8, %zmm0, %zmm16 + vpminsq %zmm1, %zmm9, %zmm0 + vpminsq %zmm15, %zmm4, %zmm8 + vpmaxsq %zmm15, %zmm4, %zmm15 + vpshufd $78, %zmm0, %zmm4 + vpmaxsq %zmm2, %zmm5, %zmm5 + vpshufd $78, %zmm16, %zmm0 + vpminsq %zmm10, %zmm7, %zmm2 + vpshufd $78, %zmm15, %zmm16 + vpmaxsq %zmm10, %zmm7, %zmm7 + vpminsq %zmm11, %zmm13, %zmm15 + vpshufd $78, %zmm17, %zmm10 + vpmaxsq %zmm1, %zmm9, %zmm1 + vpminsq %zmm3, %zmm6, %zmm9 + vpmaxsq %zmm3, %zmm6, %zmm6 + vpshufd $78, %zmm14, %zmm3 + vpmaxsq %zmm11, %zmm13, %zmm14 + vpminsq %zmm10, %zmm5, %zmm11 + vpmaxsq %zmm10, %zmm5, %zmm5 + vpshufd $78, %zmm15, %zmm10 + vpmaxsq %zmm10, %zmm15, %zmm17 + vpshufd $78, %zmm2, %zmm2 + vpshufd $78, %zmm9, %zmm9 + vpminsq %zmm10, %zmm15, %zmm17{%k1} + vpshufd $78, %zmm14, %zmm10 + vpminsq %zmm2, %zmm12, %zmm13 + vpmaxsq %zmm2, %zmm12, %zmm12 + vpminsq %zmm3, %zmm7, %zmm2 + vpmaxsq %zmm3, %zmm7, %zmm7 + vpminsq %zmm4, %zmm19, %zmm3 + vpmaxsq %zmm4, %zmm19, %zmm19 + vpminsq %zmm0, %zmm1, %zmm4 + vpmaxsq %zmm0, %zmm1, %zmm1 + vpminsq %zmm9, %zmm8, %zmm0 + vpmaxsq %zmm9, %zmm8, %zmm8 + vpminsq %zmm16, %zmm6, %zmm9 + vpmaxsq %zmm16, %zmm6, %zmm6 + vpmaxsq %zmm10, %zmm14, %zmm16 + vpminsq %zmm10, %zmm14, %zmm16{%k1} + vpshufd $78, %zmm11, %zmm10 + vpmaxsq %zmm10, %zmm11, %zmm15 + vpminsq %zmm10, %zmm11, %zmm15{%k1} + vpshufd $78, %zmm5, %zmm10 + vpmaxsq %zmm10, %zmm5, %zmm14 + vpminsq %zmm10, %zmm5, %zmm14{%k1} + vpshufd $78, %zmm13, %zmm5 + vpmaxsq %zmm5, %zmm13, %zmm11 + vpminsq %zmm5, %zmm13, %zmm11{%k1} + vpshufd $78, %zmm12, %zmm5 + vpmaxsq %zmm5, %zmm12, %zmm13 + vpminsq %zmm5, %zmm12, %zmm13{%k1} + vpshufd $78, %zmm2, %zmm5 + vpmaxsq %zmm5, %zmm2, %zmm12 + vpminsq %zmm5, %zmm2, %zmm12{%k1} + vpshufd $78, %zmm7, %zmm2 + vpmaxsq %zmm2, %zmm7, %zmm10 + vpminsq %zmm2, %zmm7, %zmm10{%k1} + vpshufd $78, %zmm3, %zmm2 + vpshufd $78, %zmm4, %zmm7 + vpmaxsq %zmm2, %zmm3, %zmm5 + vpminsq %zmm2, %zmm3, %zmm5{%k1} + vpshufd $78, %zmm19, %zmm2 + vpmaxsq %zmm2, %zmm19, %zmm3 + vpminsq %zmm2, %zmm19, %zmm3{%k1} + vpmaxsq %zmm7, %zmm4, %zmm2 + vpminsq %zmm7, %zmm4, %zmm2{%k1} + vpshufd $78, %zmm1, %zmm7 + vpmaxsq %zmm7, %zmm1, %zmm4 + vpminsq %zmm7, %zmm1, %zmm4{%k1} + vpshufd $78, %zmm0, %zmm7 + vpmaxsq %zmm7, %zmm0, %zmm1 + vpminsq %zmm7, %zmm0, %zmm1{%k1} + vpshufd $78, %zmm8, %zmm7 + vpmaxsq %zmm7, %zmm8, %zmm0 + vpminsq %zmm7, %zmm8, %zmm0{%k1} + vpshufd $78, %zmm9, %zmm7 + vpmaxsq %zmm7, %zmm9, %zmm8 + vpminsq %zmm7, %zmm9, %zmm8{%k1} + vpshufd $78, %zmm6, %zmm9 + vpmaxsq %zmm9, %zmm6, %zmm7 + vpminsq %zmm9, %zmm6, %zmm7{%k1} + jbe .L123 + vpermq $27, %zmm5, %zmm5 + vpermq $27, %zmm3, %zmm3 + vpermq $27, %zmm2, %zmm2 + movl $51, %r15d + vpermq $27, %zmm4, %zmm4 + vpermq $27, %zmm1, %zmm1 + kmovb %r15d, %k2 + vpermq $27, %zmm0, %zmm0 + vpermq $27, %zmm8, %zmm8 + vpermq $27, %zmm7, %zmm7 + vpminsq %zmm2, %zmm13, %zmm6 + cmpq $7, -8(%rsp) + vpminsq %zmm7, %zmm17, %zmm9 + vpmaxsq %zmm7, %zmm17, %zmm7 + vpermq $27, %zmm6, %zmm6 + vpminsq %zmm8, %zmm16, %zmm17 + vpmaxsq %zmm8, %zmm16, %zmm8 + vpermq $27, %zmm7, %zmm7 + vpminsq %zmm0, %zmm15, %zmm16 + vpmaxsq %zmm0, %zmm15, %zmm0 + vpermq $27, %zmm8, %zmm8 + vpminsq %zmm1, %zmm14, %zmm15 + vpmaxsq %zmm1, %zmm14, %zmm1 + vpermq $27, %zmm0, %zmm0 + vpminsq %zmm4, %zmm11, %zmm14 + vpmaxsq %zmm4, %zmm11, %zmm4 + vpermq $27, %zmm1, %zmm1 + vpminsq %zmm3, %zmm12, %zmm11 + vpmaxsq %zmm3, %zmm12, %zmm3 + vpermq $27, %zmm14, %zmm14 + 
vpminsq %zmm5, %zmm10, %zmm12 + vpmaxsq %zmm5, %zmm10, %zmm5 + vpermq $27, %zmm11, %zmm10 + vpermq $27, %zmm12, %zmm12 + vpminsq %zmm7, %zmm5, %zmm11 + vpmaxsq %zmm2, %zmm13, %zmm2 + vpminsq %zmm12, %zmm9, %zmm19 + vpmaxsq %zmm12, %zmm9, %zmm13 + vpmaxsq %zmm7, %zmm5, %zmm20 + vpminsq %zmm8, %zmm3, %zmm9 + vpminsq %zmm10, %zmm17, %zmm5 + vpmaxsq %zmm10, %zmm17, %zmm10 + vpmaxsq %zmm8, %zmm3, %zmm17 + vpminsq %zmm6, %zmm16, %zmm3 + vpermq $27, %zmm10, %zmm10 + vpminsq %zmm14, %zmm15, %zmm8 + vpminsq %zmm0, %zmm2, %zmm7 + vpmaxsq %zmm14, %zmm15, %zmm14 + vpmaxsq %zmm0, %zmm2, %zmm0 + vpermq $27, %zmm3, %zmm2 + vpminsq %zmm1, %zmm4, %zmm15 + vpmaxsq %zmm1, %zmm4, %zmm12 + vpermq $27, %zmm8, %zmm3 + vpmaxsq %zmm6, %zmm16, %zmm6 + vpermq $27, %zmm13, %zmm1 + vpermq $27, %zmm15, %zmm16 + vpermq $27, %zmm17, %zmm8 + vpminsq %zmm2, %zmm5, %zmm17 + vpminsq %zmm3, %zmm19, %zmm13 + vpminsq %zmm16, %zmm11, %zmm4 + vpermq $27, %zmm7, %zmm7 + vpermq $27, %zmm20, %zmm15 + vpmaxsq %zmm3, %zmm19, %zmm19 + vpmaxsq %zmm16, %zmm11, %zmm20 + vpermq $27, %zmm17, %zmm11 + vpmaxsq %zmm2, %zmm5, %zmm3 + vpminsq %zmm1, %zmm14, %zmm5 + vpmaxsq %zmm1, %zmm14, %zmm14 + vpminsq %zmm7, %zmm9, %zmm16 + vpmaxsq %zmm10, %zmm6, %zmm1 + vpminsq %zmm10, %zmm6, %zmm2 + vpermq $27, %zmm19, %zmm10 + vpermq $27, %zmm14, %zmm6 + vpmaxsq %zmm7, %zmm9, %zmm9 + vpminsq %zmm11, %zmm13, %zmm14 + vpminsq %zmm15, %zmm12, %zmm7 + vpmaxsq %zmm15, %zmm12, %zmm15 + vpermq $27, %zmm16, %zmm17 + vpminsq %zmm8, %zmm0, %zmm12 + vpmaxsq %zmm11, %zmm13, %zmm13 + vpermq $27, %zmm2, %zmm2 + vpminsq %zmm10, %zmm3, %zmm11 + vpermq $27, %zmm12, %zmm16 + vpmaxsq %zmm10, %zmm3, %zmm3 + vpmaxsq %zmm8, %zmm0, %zmm0 + vpermq $27, %zmm14, %zmm10 + vpermq $27, %zmm20, %zmm8 + vpminsq %zmm17, %zmm4, %zmm19 + vpmaxsq %zmm17, %zmm4, %zmm4 + vpermq $27, %zmm15, %zmm15 + vpminsq %zmm8, %zmm9, %zmm17 + vpmaxsq %zmm8, %zmm9, %zmm9 + vpminsq %zmm16, %zmm7, %zmm8 + vpmaxsq %zmm16, %zmm7, %zmm7 + vpmaxsq %zmm10, %zmm14, %zmm16 + vpminsq %zmm2, %zmm5, %zmm12 + vpminsq %zmm10, %zmm14, %zmm16{%k2} + vpermq $27, %zmm13, %zmm10 + vpmaxsq %zmm2, %zmm5, %zmm5 + vpminsq %zmm6, %zmm1, %zmm2 + vpmaxsq %zmm6, %zmm1, %zmm1 + vpminsq %zmm15, %zmm0, %zmm6 + vpmaxsq %zmm15, %zmm0, %zmm0 + vpmaxsq %zmm10, %zmm13, %zmm15 + vpminsq %zmm10, %zmm13, %zmm15{%k2} + vpermq $27, %zmm11, %zmm10 + vpmaxsq %zmm10, %zmm11, %zmm14 + vpminsq %zmm10, %zmm11, %zmm14{%k2} + vpermq $27, %zmm3, %zmm10 + vpmaxsq %zmm10, %zmm3, %zmm11 + vpminsq %zmm10, %zmm3, %zmm11{%k2} + vpermq $27, %zmm12, %zmm3 + vpmaxsq %zmm3, %zmm12, %zmm13 + vpminsq %zmm3, %zmm12, %zmm13{%k2} + vpermq $27, %zmm5, %zmm3 + vpmaxsq %zmm3, %zmm5, %zmm12 + vpminsq %zmm3, %zmm5, %zmm12{%k2} + vpermq $27, %zmm2, %zmm3 + vpmaxsq %zmm3, %zmm2, %zmm10 + vpminsq %zmm3, %zmm2, %zmm10{%k2} + vpermq $27, %zmm1, %zmm2 + vpmaxsq %zmm2, %zmm1, %zmm5 + vpminsq %zmm2, %zmm1, %zmm5{%k2} + vpermq $27, %zmm19, %zmm1 + vpmaxsq %zmm1, %zmm19, %zmm3 + vpminsq %zmm1, %zmm19, %zmm3{%k2} + vpermq $27, %zmm4, %zmm1 + vpmaxsq %zmm1, %zmm4, %zmm2 + vpminsq %zmm1, %zmm4, %zmm2{%k2} + vpermq $27, %zmm17, %zmm1 + vpmaxsq %zmm1, %zmm17, %zmm4 + vpminsq %zmm1, %zmm17, %zmm4{%k2} + vpermq $27, %zmm9, %zmm17 + vpmaxsq %zmm17, %zmm9, %zmm1 + vpminsq %zmm17, %zmm9, %zmm1{%k2} + vpermq $27, %zmm8, %zmm17 + vpmaxsq %zmm17, %zmm8, %zmm9 + vpminsq %zmm17, %zmm8, %zmm9{%k2} + vpermq $27, %zmm7, %zmm17 + vpmaxsq %zmm17, %zmm7, %zmm8 + vpminsq %zmm17, %zmm7, %zmm8{%k2} + vpermq $27, %zmm6, %zmm17 + vpmaxsq %zmm17, %zmm6, %zmm7 + vpminsq %zmm17, %zmm6, %zmm7{%k2} + vpermq $27, 
%zmm0, %zmm17 + vpmaxsq %zmm17, %zmm0, %zmm6 + vpminsq %zmm17, %zmm0, %zmm6{%k2} + vpshufd $78, %zmm16, %zmm0 + vpmaxsq %zmm0, %zmm16, %zmm17 + vpminsq %zmm0, %zmm16, %zmm17{%k1} + vpshufd $78, %zmm15, %zmm0 + vpmaxsq %zmm0, %zmm15, %zmm16 + vpminsq %zmm0, %zmm15, %zmm16{%k1} + vpshufd $78, %zmm14, %zmm0 + vpmaxsq %zmm0, %zmm14, %zmm15 + vpminsq %zmm0, %zmm14, %zmm15{%k1} + vpshufd $78, %zmm11, %zmm0 + vpmaxsq %zmm0, %zmm11, %zmm14 + vpminsq %zmm0, %zmm11, %zmm14{%k1} + vpshufd $78, %zmm13, %zmm0 + vpmaxsq %zmm0, %zmm13, %zmm11 + vpminsq %zmm0, %zmm13, %zmm11{%k1} + vpshufd $78, %zmm12, %zmm0 + vpmaxsq %zmm0, %zmm12, %zmm13 + vpminsq %zmm0, %zmm12, %zmm13{%k1} + vpshufd $78, %zmm10, %zmm0 + vpmaxsq %zmm0, %zmm10, %zmm12 + vpminsq %zmm0, %zmm10, %zmm12{%k1} + vpshufd $78, %zmm5, %zmm0 + vpmaxsq %zmm0, %zmm5, %zmm10 + vpminsq %zmm0, %zmm5, %zmm10{%k1} + vpshufd $78, %zmm3, %zmm0 + vpmaxsq %zmm0, %zmm3, %zmm5 + vpminsq %zmm0, %zmm3, %zmm5{%k1} + vpshufd $78, %zmm2, %zmm0 + vpmaxsq %zmm0, %zmm2, %zmm3 + vpminsq %zmm0, %zmm2, %zmm3{%k1} + vpshufd $78, %zmm4, %zmm0 + vpmaxsq %zmm0, %zmm4, %zmm2 + vpminsq %zmm0, %zmm4, %zmm2{%k1} + vpshufd $78, %zmm1, %zmm0 + vpmaxsq %zmm0, %zmm1, %zmm4 + vpminsq %zmm0, %zmm1, %zmm4{%k1} + vpshufd $78, %zmm9, %zmm0 + vpmaxsq %zmm0, %zmm9, %zmm1 + vpminsq %zmm0, %zmm9, %zmm1{%k1} + vpshufd $78, %zmm8, %zmm9 + vpmaxsq %zmm9, %zmm8, %zmm0 + vpminsq %zmm9, %zmm8, %zmm0{%k1} + vpshufd $78, %zmm7, %zmm9 + vpmaxsq %zmm9, %zmm7, %zmm8 + vpminsq %zmm9, %zmm7, %zmm8{%k1} + vpshufd $78, %zmm6, %zmm9 + vpmaxsq %zmm9, %zmm6, %zmm7 + vpminsq %zmm9, %zmm6, %zmm7{%k1} + jbe .L123 + vmovdqa64 .LC2(%rip), %zmm6 + movl $65535, %r15d + kmovd %r15d, %k3 + vpermq %zmm7, %zmm6, %zmm7 + vpermq %zmm5, %zmm6, %zmm5 + vpermq %zmm3, %zmm6, %zmm3 + vpermq %zmm2, %zmm6, %zmm2 + vpermq %zmm4, %zmm6, %zmm4 + vpermq %zmm1, %zmm6, %zmm1 + vpermq %zmm0, %zmm6, %zmm0 + vpermq %zmm8, %zmm6, %zmm8 + vpminsq %zmm7, %zmm17, %zmm9 + vpmaxsq %zmm7, %zmm17, %zmm19 + vpminsq %zmm8, %zmm16, %zmm7 + vpmaxsq %zmm8, %zmm16, %zmm8 + vpminsq %zmm0, %zmm15, %zmm16 + vpmaxsq %zmm0, %zmm15, %zmm0 + vpminsq %zmm1, %zmm14, %zmm15 + vpermq %zmm8, %zmm6, %zmm8 + vpmaxsq %zmm1, %zmm14, %zmm1 + vpminsq %zmm4, %zmm11, %zmm14 + vpermq %zmm0, %zmm6, %zmm0 + vpmaxsq %zmm4, %zmm11, %zmm4 + vpminsq %zmm2, %zmm13, %zmm11 + vpermq %zmm14, %zmm6, %zmm14 + vpmaxsq %zmm2, %zmm13, %zmm2 + vpminsq %zmm3, %zmm12, %zmm13 + vpermq %zmm11, %zmm6, %zmm11 + vpmaxsq %zmm3, %zmm12, %zmm3 + vpminsq %zmm5, %zmm10, %zmm12 + vpermq %zmm13, %zmm6, %zmm17 + vpmaxsq %zmm5, %zmm10, %zmm5 + vpermq %zmm19, %zmm6, %zmm13 + vpermq %zmm12, %zmm6, %zmm12 + vpminsq %zmm12, %zmm9, %zmm10 + vpmaxsq %zmm12, %zmm9, %zmm19 + vpermq %zmm1, %zmm6, %zmm1 + vpminsq %zmm13, %zmm5, %zmm12 + vpmaxsq %zmm13, %zmm5, %zmm20 + vpminsq %zmm8, %zmm3, %zmm9 + vpminsq %zmm17, %zmm7, %zmm13 + vpminsq %zmm11, %zmm16, %zmm5 + vpmaxsq %zmm17, %zmm7, %zmm7 + vpmaxsq %zmm8, %zmm3, %zmm17 + vpmaxsq %zmm11, %zmm16, %zmm3 + vpermq %zmm19, %zmm6, %zmm11 + vpminsq %zmm14, %zmm15, %zmm16 + vpminsq %zmm0, %zmm2, %zmm8 + vpmaxsq %zmm14, %zmm15, %zmm14 + vpermq %zmm16, %zmm6, %zmm16 + vpermq %zmm5, %zmm6, %zmm15 + vpmaxsq %zmm0, %zmm2, %zmm0 + vpminsq %zmm1, %zmm4, %zmm2 + vpermq %zmm20, %zmm6, %zmm5 + vpminsq %zmm16, %zmm10, %zmm19 + vpmaxsq %zmm1, %zmm4, %zmm1 + vpermq %zmm7, %zmm6, %zmm4 + vpermq %zmm8, %zmm6, %zmm7 + vpermq %zmm2, %zmm6, %zmm8 + vpermq %zmm17, %zmm6, %zmm2 + vpmaxsq %zmm16, %zmm10, %zmm17 + vpminsq %zmm15, %zmm13, %zmm16 + vpminsq %zmm11, %zmm14, %zmm10 + vpmaxsq 
%zmm15, %zmm13, %zmm13 + vpmaxsq %zmm8, %zmm12, %zmm20 + vpmaxsq %zmm11, %zmm14, %zmm15 + vpermq %zmm16, %zmm6, %zmm11 + vpminsq %zmm4, %zmm3, %zmm14 + vpmaxsq %zmm5, %zmm1, %zmm22 + vpmaxsq %zmm4, %zmm3, %zmm3 + vpminsq %zmm2, %zmm0, %zmm21 + vpermq %zmm22, %zmm6, %zmm16 + vpminsq %zmm8, %zmm12, %zmm4 + vpminsq %zmm7, %zmm9, %zmm8 + vpermq %zmm17, %zmm6, %zmm12 + vpmaxsq %zmm7, %zmm9, %zmm9 + vpermq %zmm21, %zmm6, %zmm17 + vpminsq %zmm5, %zmm1, %zmm7 + vpmaxsq %zmm2, %zmm0, %zmm0 + vpermq %zmm14, %zmm6, %zmm5 + vpermq %zmm15, %zmm6, %zmm2 + vpermq %zmm8, %zmm6, %zmm1 + vpminsq %zmm11, %zmm19, %zmm15 + vpermq %zmm20, %zmm6, %zmm8 + vpmaxsq %zmm11, %zmm19, %zmm14 + vpminsq %zmm12, %zmm13, %zmm11 + vpmaxsq %zmm12, %zmm13, %zmm13 + vpminsq %zmm5, %zmm10, %zmm12 + vpmaxsq %zmm5, %zmm10, %zmm10 + vpminsq %zmm2, %zmm3, %zmm5 + vpmaxsq %zmm2, %zmm3, %zmm3 + vpminsq %zmm1, %zmm4, %zmm2 + vpmaxsq %zmm1, %zmm4, %zmm4 + vpminsq %zmm8, %zmm9, %zmm1 + vpmaxsq %zmm8, %zmm9, %zmm9 + vpminsq %zmm17, %zmm7, %zmm8 + vpmaxsq %zmm17, %zmm7, %zmm7 + vpminsq %zmm16, %zmm0, %zmm17 + vpmaxsq %zmm16, %zmm0, %zmm0 + vpermq %zmm15, %zmm6, %zmm16 + vpminsq %zmm16, %zmm15, %zmm19 + vpmaxsq %zmm16, %zmm15, %zmm15 + vpermq %zmm14, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm15{%k3} + vpminsq %zmm16, %zmm14, %zmm19 + vpmaxsq %zmm16, %zmm14, %zmm14 + vpermq %zmm11, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm14{%k3} + vpminsq %zmm16, %zmm11, %zmm19 + vpmaxsq %zmm16, %zmm11, %zmm11 + vpermq %zmm13, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm11{%k3} + vpminsq %zmm16, %zmm13, %zmm19 + vpmaxsq %zmm16, %zmm13, %zmm13 + vpermq %zmm12, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm13{%k3} + vpminsq %zmm16, %zmm12, %zmm19 + vpmaxsq %zmm16, %zmm12, %zmm12 + vpermq %zmm10, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm12{%k3} + vpminsq %zmm16, %zmm10, %zmm19 + vpmaxsq %zmm16, %zmm10, %zmm10 + vpermq %zmm5, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm10{%k3} + vpminsq %zmm16, %zmm5, %zmm19 + vpmaxsq %zmm16, %zmm5, %zmm5 + vpermq %zmm3, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm5{%k3} + vpminsq %zmm16, %zmm3, %zmm19 + vpmaxsq %zmm16, %zmm3, %zmm3 + vpermq %zmm2, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm3{%k3} + vpminsq %zmm16, %zmm2, %zmm19 + vpmaxsq %zmm16, %zmm2, %zmm2 + vpermq %zmm4, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm2{%k3} + vpminsq %zmm16, %zmm4, %zmm19 + vpmaxsq %zmm16, %zmm4, %zmm4 + vpermq %zmm1, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm4{%k3} + vpminsq %zmm16, %zmm1, %zmm19 + vpmaxsq %zmm16, %zmm1, %zmm1 + vpermq %zmm9, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm1{%k3} + vpminsq %zmm16, %zmm9, %zmm19 + vpmaxsq %zmm16, %zmm9, %zmm9 + vpermq %zmm8, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm9{%k3} + vpminsq %zmm16, %zmm8, %zmm19 + vpmaxsq %zmm16, %zmm8, %zmm8 + vpermq %zmm7, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm8{%k3} + vpminsq %zmm16, %zmm7, %zmm19 + vpmaxsq %zmm16, %zmm7, %zmm7 + vpermq %zmm17, %zmm6, %zmm16 + vpermq %zmm0, %zmm6, %zmm6 + vmovdqu16 %zmm19, %zmm7{%k3} + vpminsq %zmm16, %zmm17, %zmm19 + vpmaxsq %zmm16, %zmm17, %zmm17 + vpminsq %zmm6, %zmm0, %zmm16 + vpmaxsq %zmm6, %zmm0, %zmm0 + vshufi32x4 $177, %zmm15, %zmm15, %zmm6 + vmovdqu16 %zmm16, %zmm0{%k3} + vpmaxsq %zmm15, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm17{%k3} + vpminsq %zmm15, %zmm6, %zmm16{%k2} + vshufi32x4 $177, %zmm14, %zmm14, %zmm6 + vpmaxsq %zmm14, %zmm6, %zmm15 + vpminsq %zmm14, %zmm6, %zmm15{%k2} + vshufi32x4 $177, %zmm11, %zmm11, %zmm6 + vpmaxsq %zmm11, %zmm6, %zmm14 + vpminsq %zmm11, %zmm6, %zmm14{%k2} + vshufi32x4 $177, %zmm13, %zmm13, %zmm6 + vpmaxsq %zmm13, %zmm6, %zmm11 + vpminsq %zmm13, %zmm6, 
%zmm11{%k2} + vshufi32x4 $177, %zmm12, %zmm12, %zmm6 + vpmaxsq %zmm12, %zmm6, %zmm13 + vpminsq %zmm12, %zmm6, %zmm13{%k2} + vshufi32x4 $177, %zmm10, %zmm10, %zmm6 + vpmaxsq %zmm10, %zmm6, %zmm12 + vpminsq %zmm10, %zmm6, %zmm12{%k2} + vshufi32x4 $177, %zmm5, %zmm5, %zmm6 + vpmaxsq %zmm5, %zmm6, %zmm10 + vpminsq %zmm5, %zmm6, %zmm10{%k2} + vshufi32x4 $177, %zmm3, %zmm3, %zmm6 + vpmaxsq %zmm3, %zmm6, %zmm5 + vpminsq %zmm3, %zmm6, %zmm5{%k2} + vshufi32x4 $177, %zmm2, %zmm2, %zmm6 + vpmaxsq %zmm2, %zmm6, %zmm3 + vpminsq %zmm2, %zmm6, %zmm3{%k2} + vshufi32x4 $177, %zmm4, %zmm4, %zmm6 + vpmaxsq %zmm4, %zmm6, %zmm2 + vpminsq %zmm4, %zmm6, %zmm2{%k2} + vshufi32x4 $177, %zmm1, %zmm1, %zmm6 + vpmaxsq %zmm1, %zmm6, %zmm4 + vpminsq %zmm1, %zmm6, %zmm4{%k2} + vshufi32x4 $177, %zmm9, %zmm9, %zmm6 + vpmaxsq %zmm9, %zmm6, %zmm1 + vpminsq %zmm9, %zmm6, %zmm1{%k2} + vshufi32x4 $177, %zmm8, %zmm8, %zmm6 + vpmaxsq %zmm8, %zmm6, %zmm9 + vpminsq %zmm8, %zmm6, %zmm9{%k2} + vshufi32x4 $177, %zmm7, %zmm7, %zmm6 + vpmaxsq %zmm7, %zmm6, %zmm8 + vpminsq %zmm7, %zmm6, %zmm8{%k2} + vshufi32x4 $177, %zmm17, %zmm17, %zmm6 + vpmaxsq %zmm17, %zmm6, %zmm7 + vpminsq %zmm17, %zmm6, %zmm7{%k2} + vshufi32x4 $177, %zmm0, %zmm0, %zmm17 + vpmaxsq %zmm0, %zmm17, %zmm6 + vpminsq %zmm0, %zmm17, %zmm6{%k2} + vpshufd $78, %zmm16, %zmm0 + vpmaxsq %zmm0, %zmm16, %zmm17 + vpminsq %zmm0, %zmm16, %zmm17{%k1} + vpshufd $78, %zmm15, %zmm0 + vpmaxsq %zmm0, %zmm15, %zmm16 + vpminsq %zmm0, %zmm15, %zmm16{%k1} + vpshufd $78, %zmm14, %zmm0 + vpmaxsq %zmm0, %zmm14, %zmm15 + vpminsq %zmm0, %zmm14, %zmm15{%k1} + vpshufd $78, %zmm11, %zmm0 + vpmaxsq %zmm0, %zmm11, %zmm14 + vpminsq %zmm0, %zmm11, %zmm14{%k1} + vpshufd $78, %zmm13, %zmm0 + vpmaxsq %zmm0, %zmm13, %zmm11 + vpminsq %zmm0, %zmm13, %zmm11{%k1} + vpshufd $78, %zmm12, %zmm0 + vpmaxsq %zmm0, %zmm12, %zmm13 + vpminsq %zmm0, %zmm12, %zmm13{%k1} + vpshufd $78, %zmm10, %zmm0 + vpmaxsq %zmm0, %zmm10, %zmm12 + vpminsq %zmm0, %zmm10, %zmm12{%k1} + vpshufd $78, %zmm5, %zmm0 + vpmaxsq %zmm0, %zmm5, %zmm10 + vpminsq %zmm0, %zmm5, %zmm10{%k1} + vpshufd $78, %zmm3, %zmm0 + vpmaxsq %zmm0, %zmm3, %zmm5 + vpminsq %zmm0, %zmm3, %zmm5{%k1} + vpshufd $78, %zmm2, %zmm0 + vpmaxsq %zmm0, %zmm2, %zmm3 + vpminsq %zmm0, %zmm2, %zmm3{%k1} + vpshufd $78, %zmm4, %zmm0 + vpmaxsq %zmm0, %zmm4, %zmm2 + vpminsq %zmm0, %zmm4, %zmm2{%k1} + vpshufd $78, %zmm1, %zmm0 + vpmaxsq %zmm0, %zmm1, %zmm4 + vpminsq %zmm0, %zmm1, %zmm4{%k1} + vpshufd $78, %zmm9, %zmm0 + vpmaxsq %zmm0, %zmm9, %zmm1 + vpminsq %zmm0, %zmm9, %zmm1{%k1} + vpshufd $78, %zmm8, %zmm9 + vpmaxsq %zmm9, %zmm8, %zmm0 + vpminsq %zmm9, %zmm8, %zmm0{%k1} + vpshufd $78, %zmm7, %zmm9 + vpmaxsq %zmm9, %zmm7, %zmm8 + vpminsq %zmm9, %zmm7, %zmm8{%k1} + vpshufd $78, %zmm6, %zmm9 + vpmaxsq %zmm9, %zmm6, %zmm7 + vpminsq %zmm9, %zmm6, %zmm7{%k1} +.L123: + vmovdqu64 %zmm17, (%rdi) + vmovq %xmm18, %rdi + vmovdqu64 %zmm16, (%rdi) + vmovdqu64 %zmm15, (%r14) + vmovdqu64 %zmm14, 0(%r13) + vmovdqu64 %zmm11, (%r12) + vmovdqu64 %zmm13, (%rbx) + vmovq %xmm23, %rbx + vmovdqu64 %zmm12, (%r11) + vmovdqu64 %zmm10, (%r10) + vmovdqu64 %zmm5, (%r9) + vmovdqu64 %zmm3, (%r8) + vmovdqu64 %zmm2, (%rbx) + vmovdqu64 %zmm4, (%rsi) + vmovdqu64 %zmm1, (%rcx) + vmovdqu64 %zmm0, (%rdx) + vmovdqu64 %zmm8, (%rax) + movq -16(%rsp), %rax + vmovdqu64 %zmm7, (%rax) + vzeroupper + leaq -40(%rbp), %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_def_cfa 7, 8 + ret + .cfi_endproc +.LFE18785: + .size 
_ZN3hwy11N_AVX3_ZEN46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0, .-_ZN3hwy11N_AVX3_ZEN46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + .section .text._ZN3hwy6N_AVX36detail22MaybePartitionTwoValueINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_AVX36detail22MaybePartitionTwoValueINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, @function +_ZN3hwy6N_AVX36detail22MaybePartitionTwoValueINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0: +.LFB18786: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsi, %r10 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %rbx + .cfi_offset 3, -24 + cmpq $7, %rsi + jbe .L141 + movl $8, %r8d + xorl %esi, %esi + jmp .L132 + .p2align 4,,10 + .p2align 3 +.L127: + vmovdqu64 %zmm0, (%rax) + kmovb %k0, %eax + popcntq %rax, %rax + addq %rax, %rsi + leaq 8(%r8), %rax + cmpq %r10, %rax + ja .L152 + movq %rax, %r8 +.L132: + vmovdqu64 -64(%rdi,%r8,8), %zmm3 + leaq -8(%r8), %r9 + leaq (%rdi,%rsi,8), %rax + vpcmpq $0, %zmm0, %zmm3, %k0 + vpcmpq $0, %zmm1, %zmm3, %k1 + kmovb %k0, %r11d + kmovb %k1, %ebx + korb %k1, %k0, %k1 + kortestb %k1, %k1 + jc .L127 + kmovb %r11d, %k6 + kmovb %ebx, %k5 + kxnorb %k5, %k6, %k7 + kmovb %k7, %eax + tzcntl %eax, %eax + addq %r9, %rax + vpbroadcastq (%rdi,%rax,8), %zmm0 + leaq 8(%rsi), %rax + vmovdqa64 %zmm0, (%rdx) + cmpq %r9, %rax + ja .L128 + .p2align 4,,10 + .p2align 3 +.L129: + vmovdqu64 %zmm1, -64(%rdi,%rax,8) + movq %rax, %rsi + addq $8, %rax + cmpq %rax, %r9 + jnb .L129 +.L128: + subq %rsi, %r9 + leaq (%rdi,%rsi,8), %rdx + movl $255, %eax + cmpq $255, %r9 + jbe .L153 +.L130: + kmovb %eax, %k4 + xorl %eax, %eax + vmovdqu64 %zmm1, (%rdx){%k4} +.L125: + movq -8(%rbp), %rbx + leave + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L153: + .cfi_restore_state + movq $-1, %rax + bzhi %r9, %rax, %rax + movzbl %al, %eax + jmp .L130 + .p2align 4,,10 + .p2align 3 +.L152: + movq %r10, %r11 + leaq (%rdi,%r8,8), %rbx + leaq (%rdi,%rsi,8), %r9 + movl $255, %eax + subq %r8, %r11 + kmovd %eax, %k1 + cmpq $255, %r11 + jbe .L126 +.L133: + vmovdqu64 (%rbx), %zmm2{%k1}{z} + knotb %k1, %k3 + vmovdqu64 %zmm2, (%rcx){%k1} + vmovdqa64 (%rcx), %zmm2 + vpcmpq $0, %zmm0, %zmm2, %k0 + vpcmpq $0, %zmm1, %zmm2, %k2 + kandb %k1, %k0, %k0 + korb %k2, %k0, %k2 + korb %k3, %k2, %k2 + kortestb %k2, %k2 + jnc .L154 + kmovb %k0, %edx + popcntq %rdx, %rdx + addq %rsi, %rdx + vmovdqu64 %zmm0, (%r9){%k1} + leaq 8(%rdx), %rax + cmpq %r10, %rax + ja .L138 + .p2align 4,,10 + .p2align 3 +.L139: + vmovdqu64 %zmm1, -64(%rdi,%rax,8) + movq %rax, %rdx + addq $8, %rax + cmpq %rax, %r10 + jnb .L139 +.L138: + subq %rdx, %r10 + leaq (%rdi,%rdx,8), %rcx + movl $255, %eax + cmpq $255, %r10 + ja .L140 + movq $-1, %rax + bzhi %r10, %rax, %rax + movzbl %al, %eax +.L140: + kmovb %eax, %k5 + movl $1, %eax + vmovdqu64 %zmm1, (%rcx){%k5} + movq -8(%rbp), %rbx + leave + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret +.L141: + .cfi_restore_state + movq %rsi, %r11 + movq %rdi, %r9 + movq %rdi, %rbx + xorl %r8d, %r8d + xorl %esi, %esi +.L126: + movq $-1, %rax + bzhi %r11, %rax, %rax + movzbl %al, %eax + kmovd 
%eax, %k1 + jmp .L133 +.L154: + knotb %k2, %k3 + kmovb %k3, %eax + tzcntl %eax, %eax + addq %r8, %rax + vpbroadcastq (%rdi,%rax,8), %zmm0 + leaq 8(%rsi), %rax + vmovdqa64 %zmm0, (%rdx) + cmpq %r8, %rax + ja .L135 + .p2align 4,,10 + .p2align 3 +.L136: + vmovdqu64 %zmm1, -64(%rdi,%rax,8) + movq %rax, %rsi + leaq 8(%rax), %rax + cmpq %rax, %r8 + jnb .L136 + leaq (%rdi,%rsi,8), %r9 +.L135: + subq %rsi, %r8 + movl $255, %eax + cmpq $255, %r8 + ja .L137 + movq $-1, %rax + bzhi %r8, %rax, %rax + movzbl %al, %eax +.L137: + kmovb %eax, %k6 + xorl %eax, %eax + vmovdqu64 %zmm1, (%r9){%k6} + jmp .L125 + .cfi_endproc +.LFE18786: + .size _ZN3hwy6N_AVX36detail22MaybePartitionTwoValueINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, .-_ZN3hwy6N_AVX36detail22MaybePartitionTwoValueINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + .section .text._ZN3hwy6N_AVX36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_AVX36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0, @function +_ZN3hwy6N_AVX36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0: +.LFB18787: + .cfi_startproc + movq %rsi, %r8 + movq %rdx, %rcx + cmpq %rdx, %rsi + jbe .L165 + leaq (%rdx,%rdx), %rdx + leaq 1(%rcx), %r9 + leaq 1(%rdx), %rsi + addq $2, %rdx + cmpq %rsi, %r8 + jbe .L165 + movq (%rdi,%rcx,8), %r11 + vpbroadcastq %r11, %xmm1 + jmp .L158 + .p2align 4,,10 + .p2align 3 +.L168: + movq %rsi, %rax + cmpq %rdx, %r8 + ja .L166 +.L160: + cmpq %rcx, %rax + je .L165 + leaq (%rdi,%rax,8), %rdx + movq (%rdx), %rcx + movq %rcx, (%r10) + movq %r11, (%rdx) + cmpq %rax, %r8 + jbe .L167 + leaq (%rax,%rax), %rdx + leaq 1(%rax), %r9 + leaq 1(%rdx), %rsi + addq $2, %rdx + cmpq %r8, %rsi + jnb .L165 + movq %rax, %rcx +.L158: + vpbroadcastq (%rdi,%rsi,8), %xmm0 + leaq (%rdi,%rcx,8), %r10 + vpcmpq $6, %xmm1, %xmm0, %k0 + kmovb %k0, %eax + testb $1, %al + jne .L168 + cmpq %rdx, %r8 + jbe .L165 + salq $4, %r9 + vpbroadcastq (%rdi,%r9), %xmm0 + vpcmpq $6, %xmm1, %xmm0, %k1 + kmovb %k1, %eax + testb $1, %al + je .L165 + movq %rdx, %rax + jmp .L160 + .p2align 4,,10 + .p2align 3 +.L165: + ret + .p2align 4,,10 + .p2align 3 +.L166: + salq $4, %r9 + vpbroadcastq (%rdi,%r9), %xmm2 + vpcmpq $6, %xmm0, %xmm2, %k2 + kmovb %k2, %esi + andl $1, %esi + cmovne %rdx, %rax + jmp .L160 + .p2align 4,,10 + .p2align 3 +.L167: + ret + .cfi_endproc +.LFE18787: + .size _ZN3hwy6N_AVX36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0, .-_ZN3hwy6N_AVX36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0 + .section .text._ZN3hwy6N_AVX36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_AVX36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0, @function +_ZN3hwy6N_AVX36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0: +.LFB18788: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + .cfi_offset 15, -24 + leaq 0(,%rsi,8), %r15 + leaq (%rdi,%r15), 
%rdx + pushq %r14 + .cfi_offset 14, -32 + leaq (%rdx,%r15), %r14 + pushq %r13 + vmovq %rdx, %xmm18 + .cfi_offset 13, -40 + leaq (%r14,%r15), %r13 + pushq %r12 + .cfi_offset 12, -48 + leaq 0(%r13,%r15), %r12 + pushq %rbx + .cfi_offset 3, -56 + leaq (%r12,%r15), %rbx + leaq (%rbx,%r15), %r11 + leaq (%r11,%r15), %r10 + andq $-64, %rsp + leaq (%r10,%r15), %r9 + movq %rsi, -8(%rsp) + leaq (%r9,%r15), %r8 + vmovdqu64 (%rdi), %zmm4 + vmovdqu64 (%r9), %zmm5 + leaq (%r8,%r15), %rdx + vpminsq (%r8), %zmm5, %zmm8 + vmovdqu64 (%r11), %zmm1 + leaq (%rdx,%r15), %rsi + vmovq %rdx, %xmm23 + vpmaxsq (%r8), %zmm5, %zmm6 + leaq (%rsi,%r15), %rcx + leaq (%rcx,%r15), %rdx + leaq (%rdx,%r15), %rax + addq %rax, %r15 + movq %r15, -16(%rsp) + vmovq %xmm18, %r15 + vpminsq (%r15), %zmm4, %zmm13 + vpmaxsq (%r15), %zmm4, %zmm7 + vmovq %xmm23, %r15 + vmovdqu64 (%r14), %zmm4 + vmovdqu64 (%r15), %zmm5 + vpminsq 0(%r13), %zmm4, %zmm14 + vpmaxsq 0(%r13), %zmm4, %zmm3 + vpminsq (%rsi), %zmm5, %zmm9 + vpmaxsq (%rsi), %zmm5, %zmm15 + vmovdqu64 (%r12), %zmm4 + vmovdqu64 (%rcx), %zmm5 + vpminsq %zmm14, %zmm13, %zmm10 + vpmaxsq %zmm14, %zmm13, %zmm13 + vpminsq (%rbx), %zmm4, %zmm12 + vpmaxsq (%rbx), %zmm4, %zmm2 + vpminsq (%rdx), %zmm5, %zmm0 + vpminsq (%r10), %zmm1, %zmm4 + vpmaxsq (%rdx), %zmm5, %zmm16 + vpmaxsq (%r10), %zmm1, %zmm1 + vmovdqu64 (%rax), %zmm5 + movq -16(%rsp), %r15 + vpminsq %zmm3, %zmm7, %zmm14 + vpmaxsq %zmm3, %zmm7, %zmm7 + vpminsq %zmm4, %zmm12, %zmm3 + vpmaxsq %zmm4, %zmm12, %zmm12 + cmpq $1, -8(%rsp) + vpminsq (%r15), %zmm5, %zmm11 + vpmaxsq (%r15), %zmm5, %zmm5 + vpminsq %zmm1, %zmm2, %zmm4 + vpmaxsq %zmm1, %zmm2, %zmm2 + vpminsq %zmm9, %zmm8, %zmm1 + vpmaxsq %zmm9, %zmm8, %zmm8 + vpminsq %zmm15, %zmm6, %zmm9 + vpmaxsq %zmm15, %zmm6, %zmm6 + vpminsq %zmm11, %zmm0, %zmm15 + vpmaxsq %zmm11, %zmm0, %zmm0 + vpminsq %zmm5, %zmm16, %zmm11 + vpmaxsq %zmm5, %zmm16, %zmm16 + vpminsq %zmm3, %zmm10, %zmm5 + vpmaxsq %zmm3, %zmm10, %zmm10 + vpminsq %zmm4, %zmm14, %zmm3 + vpmaxsq %zmm4, %zmm14, %zmm14 + vpminsq %zmm12, %zmm13, %zmm4 + vpmaxsq %zmm12, %zmm13, %zmm13 + vpminsq %zmm2, %zmm7, %zmm12 + vpmaxsq %zmm2, %zmm7, %zmm7 + vpminsq %zmm15, %zmm1, %zmm2 + vpmaxsq %zmm15, %zmm1, %zmm1 + vpminsq %zmm11, %zmm9, %zmm15 + vpmaxsq %zmm11, %zmm9, %zmm9 + vpminsq %zmm0, %zmm8, %zmm11 + vpmaxsq %zmm0, %zmm8, %zmm8 + vpminsq %zmm16, %zmm6, %zmm0 + vpmaxsq %zmm16, %zmm6, %zmm6 + vpminsq %zmm2, %zmm5, %zmm17 + vpmaxsq %zmm2, %zmm5, %zmm5 + vpminsq %zmm15, %zmm3, %zmm2 + vpmaxsq %zmm15, %zmm3, %zmm3 + vpminsq %zmm11, %zmm4, %zmm15 + vpmaxsq %zmm11, %zmm4, %zmm4 + vpminsq %zmm0, %zmm12, %zmm11 + vpmaxsq %zmm0, %zmm12, %zmm12 + vpminsq %zmm1, %zmm10, %zmm0 + vpmaxsq %zmm1, %zmm10, %zmm10 + vpminsq %zmm9, %zmm14, %zmm1 + vpmaxsq %zmm9, %zmm14, %zmm14 + vpminsq %zmm8, %zmm13, %zmm9 + vpmaxsq %zmm8, %zmm13, %zmm13 + vpminsq %zmm6, %zmm7, %zmm8 + vpmaxsq %zmm6, %zmm7, %zmm7 + vpminsq %zmm4, %zmm1, %zmm6 + vpmaxsq %zmm4, %zmm1, %zmm1 + vpminsq %zmm3, %zmm9, %zmm4 + vpmaxsq %zmm3, %zmm9, %zmm9 + vpminsq %zmm10, %zmm11, %zmm3 + vpmaxsq %zmm10, %zmm11, %zmm11 + vpminsq %zmm12, %zmm8, %zmm10 + vpmaxsq %zmm12, %zmm8, %zmm8 + vpminsq %zmm13, %zmm14, %zmm12 + vpmaxsq %zmm13, %zmm14, %zmm14 + vpminsq %zmm5, %zmm0, %zmm13 + vpmaxsq %zmm5, %zmm0, %zmm0 + vpminsq %zmm15, %zmm2, %zmm5 + vpmaxsq %zmm15, %zmm2, %zmm2 + vpminsq %zmm13, %zmm5, %zmm16 + vpmaxsq %zmm13, %zmm5, %zmm5 + vpminsq %zmm12, %zmm10, %zmm13 + vpmaxsq %zmm12, %zmm10, %zmm10 + vpminsq %zmm0, %zmm2, %zmm12 + vpmaxsq %zmm0, %zmm2, %zmm2 + vpminsq %zmm14, %zmm8, %zmm0 + vpminsq 
%zmm5, %zmm12, %zmm15 + vpmaxsq %zmm5, %zmm12, %zmm12 + vpminsq %zmm4, %zmm6, %zmm5 + vpmaxsq %zmm4, %zmm6, %zmm6 + vpminsq %zmm1, %zmm9, %zmm4 + vpmaxsq %zmm1, %zmm9, %zmm9 + vpminsq %zmm10, %zmm0, %zmm1 + vpmaxsq %zmm10, %zmm0, %zmm0 + vpminsq %zmm2, %zmm3, %zmm10 + vpmaxsq %zmm2, %zmm3, %zmm3 + vpminsq %zmm11, %zmm13, %zmm2 + vpmaxsq %zmm11, %zmm13, %zmm13 + vpminsq %zmm5, %zmm10, %zmm11 + vpmaxsq %zmm5, %zmm10, %zmm10 + vpminsq %zmm3, %zmm6, %zmm5 + vpmaxsq %zmm3, %zmm6, %zmm6 + vpminsq %zmm4, %zmm2, %zmm3 + vpmaxsq %zmm4, %zmm2, %zmm2 + vpminsq %zmm13, %zmm9, %zmm4 + vpmaxsq %zmm13, %zmm9, %zmm9 + vpminsq %zmm5, %zmm10, %zmm13 + vpmaxsq %zmm5, %zmm10, %zmm10 + vpminsq %zmm6, %zmm3, %zmm5 + vpmaxsq %zmm6, %zmm3, %zmm3 + vpminsq %zmm4, %zmm2, %zmm6 + vpmaxsq %zmm14, %zmm8, %zmm8 + vpmaxsq %zmm4, %zmm2, %zmm2 + vpminsq %zmm12, %zmm11, %zmm14 + vpminsq %zmm9, %zmm1, %zmm4 + vpmaxsq %zmm12, %zmm11, %zmm11 + vpmaxsq %zmm9, %zmm1, %zmm1 + vpminsq %zmm5, %zmm10, %zmm12 + vpmaxsq %zmm5, %zmm10, %zmm10 + vpminsq %zmm6, %zmm3, %zmm5 + vpmaxsq %zmm6, %zmm3, %zmm3 + jbe .L170 + vpshufd $78, %zmm0, %zmm6 + vpshufd $78, %zmm7, %zmm7 + vpshufd $78, %zmm5, %zmm5 + movl $85, %r15d + vpshufd $78, %zmm3, %zmm3 + vpshufd $78, %zmm2, %zmm2 + vpshufd $78, %zmm4, %zmm4 + kmovb %r15d, %k1 + vpshufd $78, %zmm1, %zmm1 + vpshufd $78, %zmm8, %zmm8 + vpminsq %zmm7, %zmm17, %zmm0 + cmpq $3, -8(%rsp) + vpmaxsq %zmm7, %zmm17, %zmm9 + vpminsq %zmm8, %zmm16, %zmm17 + vpminsq %zmm2, %zmm13, %zmm7 + vpmaxsq %zmm8, %zmm16, %zmm8 + vpshufd $78, %zmm9, %zmm9 + vpminsq %zmm6, %zmm15, %zmm16 + vpmaxsq %zmm6, %zmm15, %zmm6 + vpshufd $78, %zmm8, %zmm8 + vpminsq %zmm1, %zmm14, %zmm15 + vpmaxsq %zmm1, %zmm14, %zmm1 + vpshufd $78, %zmm7, %zmm7 + vpminsq %zmm4, %zmm11, %zmm14 + vpmaxsq %zmm4, %zmm11, %zmm4 + vpshufd $78, %zmm1, %zmm1 + vpminsq %zmm3, %zmm12, %zmm11 + vpmaxsq %zmm3, %zmm12, %zmm3 + vpshufd $78, %zmm14, %zmm14 + vpminsq %zmm5, %zmm10, %zmm12 + vpmaxsq %zmm5, %zmm10, %zmm5 + vpshufd $78, %zmm11, %zmm10 + vpshufd $78, %zmm12, %zmm12 + vpmaxsq %zmm2, %zmm13, %zmm2 + vpmaxsq %zmm9, %zmm5, %zmm19 + vpminsq %zmm12, %zmm0, %zmm11 + vpmaxsq %zmm12, %zmm0, %zmm13 + vpshufd $78, %zmm6, %zmm6 + vpminsq %zmm9, %zmm5, %zmm0 + vpminsq %zmm14, %zmm15, %zmm12 + vpminsq %zmm8, %zmm3, %zmm9 + vpminsq %zmm10, %zmm17, %zmm5 + vpshufd $78, %zmm12, %zmm12 + vpmaxsq %zmm10, %zmm17, %zmm10 + vpmaxsq %zmm8, %zmm3, %zmm17 + vpminsq %zmm7, %zmm16, %zmm3 + vpminsq %zmm6, %zmm2, %zmm8 + vpshufd $78, %zmm10, %zmm10 + vpmaxsq %zmm6, %zmm2, %zmm6 + vpmaxsq %zmm14, %zmm15, %zmm14 + vpshufd $78, %zmm3, %zmm2 + vpminsq %zmm1, %zmm4, %zmm15 + vpshufd $78, %zmm17, %zmm3 + vpmaxsq %zmm1, %zmm4, %zmm4 + vpmaxsq %zmm12, %zmm11, %zmm17 + vpmaxsq %zmm7, %zmm16, %zmm7 + vpshufd $78, %zmm8, %zmm1 + vpshufd $78, %zmm13, %zmm16 + vpshufd $78, %zmm15, %zmm8 + vpminsq %zmm12, %zmm11, %zmm13 + vpshufd $78, %zmm19, %zmm15 + vpminsq %zmm2, %zmm5, %zmm11 + vpminsq %zmm16, %zmm14, %zmm12 + vpminsq %zmm8, %zmm0, %zmm19 + vpmaxsq %zmm16, %zmm14, %zmm14 + vpshufd $78, %zmm11, %zmm11 + vpmaxsq %zmm8, %zmm0, %zmm16 + vpminsq %zmm1, %zmm9, %zmm0 + vpminsq %zmm15, %zmm4, %zmm8 + vpmaxsq %zmm15, %zmm4, %zmm15 + vpshufd $78, %zmm0, %zmm4 + vpmaxsq %zmm2, %zmm5, %zmm5 + vpshufd $78, %zmm16, %zmm0 + vpminsq %zmm10, %zmm7, %zmm2 + vpshufd $78, %zmm15, %zmm16 + vpmaxsq %zmm10, %zmm7, %zmm7 + vpminsq %zmm11, %zmm13, %zmm15 + vpshufd $78, %zmm17, %zmm10 + vpmaxsq %zmm1, %zmm9, %zmm1 + vpminsq %zmm3, %zmm6, %zmm9 + vpmaxsq %zmm3, %zmm6, %zmm6 + vpshufd $78, %zmm14, %zmm3 + vpmaxsq 
%zmm11, %zmm13, %zmm14 + vpminsq %zmm10, %zmm5, %zmm11 + vpmaxsq %zmm10, %zmm5, %zmm5 + vpshufd $78, %zmm15, %zmm10 + vpmaxsq %zmm10, %zmm15, %zmm17 + vpshufd $78, %zmm2, %zmm2 + vpshufd $78, %zmm9, %zmm9 + vpminsq %zmm10, %zmm15, %zmm17{%k1} + vpshufd $78, %zmm14, %zmm10 + vpminsq %zmm2, %zmm12, %zmm13 + vpmaxsq %zmm2, %zmm12, %zmm12 + vpminsq %zmm3, %zmm7, %zmm2 + vpmaxsq %zmm3, %zmm7, %zmm7 + vpminsq %zmm4, %zmm19, %zmm3 + vpmaxsq %zmm4, %zmm19, %zmm19 + vpminsq %zmm0, %zmm1, %zmm4 + vpmaxsq %zmm0, %zmm1, %zmm1 + vpminsq %zmm9, %zmm8, %zmm0 + vpmaxsq %zmm9, %zmm8, %zmm8 + vpminsq %zmm16, %zmm6, %zmm9 + vpmaxsq %zmm16, %zmm6, %zmm6 + vpmaxsq %zmm10, %zmm14, %zmm16 + vpminsq %zmm10, %zmm14, %zmm16{%k1} + vpshufd $78, %zmm11, %zmm10 + vpmaxsq %zmm10, %zmm11, %zmm15 + vpminsq %zmm10, %zmm11, %zmm15{%k1} + vpshufd $78, %zmm5, %zmm10 + vpmaxsq %zmm10, %zmm5, %zmm14 + vpminsq %zmm10, %zmm5, %zmm14{%k1} + vpshufd $78, %zmm13, %zmm5 + vpmaxsq %zmm5, %zmm13, %zmm11 + vpminsq %zmm5, %zmm13, %zmm11{%k1} + vpshufd $78, %zmm12, %zmm5 + vpmaxsq %zmm5, %zmm12, %zmm13 + vpminsq %zmm5, %zmm12, %zmm13{%k1} + vpshufd $78, %zmm2, %zmm5 + vpmaxsq %zmm5, %zmm2, %zmm12 + vpminsq %zmm5, %zmm2, %zmm12{%k1} + vpshufd $78, %zmm7, %zmm2 + vpmaxsq %zmm2, %zmm7, %zmm10 + vpminsq %zmm2, %zmm7, %zmm10{%k1} + vpshufd $78, %zmm3, %zmm2 + vpshufd $78, %zmm4, %zmm7 + vpmaxsq %zmm2, %zmm3, %zmm5 + vpminsq %zmm2, %zmm3, %zmm5{%k1} + vpshufd $78, %zmm19, %zmm2 + vpmaxsq %zmm2, %zmm19, %zmm3 + vpminsq %zmm2, %zmm19, %zmm3{%k1} + vpmaxsq %zmm7, %zmm4, %zmm2 + vpminsq %zmm7, %zmm4, %zmm2{%k1} + vpshufd $78, %zmm1, %zmm7 + vpmaxsq %zmm7, %zmm1, %zmm4 + vpminsq %zmm7, %zmm1, %zmm4{%k1} + vpshufd $78, %zmm0, %zmm7 + vpmaxsq %zmm7, %zmm0, %zmm1 + vpminsq %zmm7, %zmm0, %zmm1{%k1} + vpshufd $78, %zmm8, %zmm7 + vpmaxsq %zmm7, %zmm8, %zmm0 + vpminsq %zmm7, %zmm8, %zmm0{%k1} + vpshufd $78, %zmm9, %zmm7 + vpmaxsq %zmm7, %zmm9, %zmm8 + vpminsq %zmm7, %zmm9, %zmm8{%k1} + vpshufd $78, %zmm6, %zmm9 + vpmaxsq %zmm9, %zmm6, %zmm7 + vpminsq %zmm9, %zmm6, %zmm7{%k1} + jbe .L170 + vpermq $27, %zmm5, %zmm5 + vpermq $27, %zmm3, %zmm3 + vpermq $27, %zmm2, %zmm2 + movl $51, %r15d + vpermq $27, %zmm4, %zmm4 + vpermq $27, %zmm1, %zmm1 + kmovb %r15d, %k2 + vpermq $27, %zmm0, %zmm0 + vpermq $27, %zmm8, %zmm8 + vpermq $27, %zmm7, %zmm7 + vpminsq %zmm2, %zmm13, %zmm6 + cmpq $7, -8(%rsp) + vpminsq %zmm7, %zmm17, %zmm9 + vpmaxsq %zmm7, %zmm17, %zmm7 + vpermq $27, %zmm6, %zmm6 + vpminsq %zmm8, %zmm16, %zmm17 + vpmaxsq %zmm8, %zmm16, %zmm8 + vpermq $27, %zmm7, %zmm7 + vpminsq %zmm0, %zmm15, %zmm16 + vpmaxsq %zmm0, %zmm15, %zmm0 + vpermq $27, %zmm8, %zmm8 + vpminsq %zmm1, %zmm14, %zmm15 + vpmaxsq %zmm1, %zmm14, %zmm1 + vpermq $27, %zmm0, %zmm0 + vpminsq %zmm4, %zmm11, %zmm14 + vpmaxsq %zmm4, %zmm11, %zmm4 + vpermq $27, %zmm1, %zmm1 + vpminsq %zmm3, %zmm12, %zmm11 + vpmaxsq %zmm3, %zmm12, %zmm3 + vpermq $27, %zmm14, %zmm14 + vpminsq %zmm5, %zmm10, %zmm12 + vpmaxsq %zmm5, %zmm10, %zmm5 + vpermq $27, %zmm11, %zmm10 + vpermq $27, %zmm12, %zmm12 + vpminsq %zmm7, %zmm5, %zmm11 + vpmaxsq %zmm2, %zmm13, %zmm2 + vpminsq %zmm12, %zmm9, %zmm19 + vpmaxsq %zmm12, %zmm9, %zmm13 + vpmaxsq %zmm7, %zmm5, %zmm20 + vpminsq %zmm8, %zmm3, %zmm9 + vpminsq %zmm10, %zmm17, %zmm5 + vpmaxsq %zmm10, %zmm17, %zmm10 + vpmaxsq %zmm8, %zmm3, %zmm17 + vpminsq %zmm6, %zmm16, %zmm3 + vpermq $27, %zmm10, %zmm10 + vpminsq %zmm14, %zmm15, %zmm8 + vpminsq %zmm0, %zmm2, %zmm7 + vpmaxsq %zmm14, %zmm15, %zmm14 + vpmaxsq %zmm0, %zmm2, %zmm0 + vpermq $27, %zmm3, %zmm2 + vpminsq %zmm1, %zmm4, %zmm15 + 
vpmaxsq %zmm1, %zmm4, %zmm12 + vpermq $27, %zmm8, %zmm3 + vpmaxsq %zmm6, %zmm16, %zmm6 + vpermq $27, %zmm13, %zmm1 + vpermq $27, %zmm15, %zmm16 + vpermq $27, %zmm17, %zmm8 + vpminsq %zmm2, %zmm5, %zmm17 + vpminsq %zmm3, %zmm19, %zmm13 + vpminsq %zmm16, %zmm11, %zmm4 + vpermq $27, %zmm7, %zmm7 + vpermq $27, %zmm20, %zmm15 + vpmaxsq %zmm3, %zmm19, %zmm19 + vpmaxsq %zmm16, %zmm11, %zmm20 + vpermq $27, %zmm17, %zmm11 + vpmaxsq %zmm2, %zmm5, %zmm3 + vpminsq %zmm1, %zmm14, %zmm5 + vpmaxsq %zmm1, %zmm14, %zmm14 + vpminsq %zmm7, %zmm9, %zmm16 + vpmaxsq %zmm10, %zmm6, %zmm1 + vpminsq %zmm10, %zmm6, %zmm2 + vpermq $27, %zmm19, %zmm10 + vpermq $27, %zmm14, %zmm6 + vpmaxsq %zmm7, %zmm9, %zmm9 + vpminsq %zmm11, %zmm13, %zmm14 + vpminsq %zmm15, %zmm12, %zmm7 + vpmaxsq %zmm15, %zmm12, %zmm15 + vpermq $27, %zmm16, %zmm17 + vpminsq %zmm8, %zmm0, %zmm12 + vpmaxsq %zmm11, %zmm13, %zmm13 + vpermq $27, %zmm2, %zmm2 + vpminsq %zmm10, %zmm3, %zmm11 + vpermq $27, %zmm12, %zmm16 + vpmaxsq %zmm10, %zmm3, %zmm3 + vpmaxsq %zmm8, %zmm0, %zmm0 + vpermq $27, %zmm14, %zmm10 + vpermq $27, %zmm20, %zmm8 + vpminsq %zmm17, %zmm4, %zmm19 + vpmaxsq %zmm17, %zmm4, %zmm4 + vpermq $27, %zmm15, %zmm15 + vpminsq %zmm8, %zmm9, %zmm17 + vpmaxsq %zmm8, %zmm9, %zmm9 + vpminsq %zmm16, %zmm7, %zmm8 + vpmaxsq %zmm16, %zmm7, %zmm7 + vpmaxsq %zmm10, %zmm14, %zmm16 + vpminsq %zmm2, %zmm5, %zmm12 + vpminsq %zmm10, %zmm14, %zmm16{%k2} + vpermq $27, %zmm13, %zmm10 + vpmaxsq %zmm2, %zmm5, %zmm5 + vpminsq %zmm6, %zmm1, %zmm2 + vpmaxsq %zmm6, %zmm1, %zmm1 + vpminsq %zmm15, %zmm0, %zmm6 + vpmaxsq %zmm15, %zmm0, %zmm0 + vpmaxsq %zmm10, %zmm13, %zmm15 + vpminsq %zmm10, %zmm13, %zmm15{%k2} + vpermq $27, %zmm11, %zmm10 + vpmaxsq %zmm10, %zmm11, %zmm14 + vpminsq %zmm10, %zmm11, %zmm14{%k2} + vpermq $27, %zmm3, %zmm10 + vpmaxsq %zmm10, %zmm3, %zmm11 + vpminsq %zmm10, %zmm3, %zmm11{%k2} + vpermq $27, %zmm12, %zmm3 + vpmaxsq %zmm3, %zmm12, %zmm13 + vpminsq %zmm3, %zmm12, %zmm13{%k2} + vpermq $27, %zmm5, %zmm3 + vpmaxsq %zmm3, %zmm5, %zmm12 + vpminsq %zmm3, %zmm5, %zmm12{%k2} + vpermq $27, %zmm2, %zmm3 + vpmaxsq %zmm3, %zmm2, %zmm10 + vpminsq %zmm3, %zmm2, %zmm10{%k2} + vpermq $27, %zmm1, %zmm2 + vpmaxsq %zmm2, %zmm1, %zmm5 + vpminsq %zmm2, %zmm1, %zmm5{%k2} + vpermq $27, %zmm19, %zmm1 + vpmaxsq %zmm1, %zmm19, %zmm3 + vpminsq %zmm1, %zmm19, %zmm3{%k2} + vpermq $27, %zmm4, %zmm1 + vpmaxsq %zmm1, %zmm4, %zmm2 + vpminsq %zmm1, %zmm4, %zmm2{%k2} + vpermq $27, %zmm17, %zmm1 + vpmaxsq %zmm1, %zmm17, %zmm4 + vpminsq %zmm1, %zmm17, %zmm4{%k2} + vpermq $27, %zmm9, %zmm17 + vpmaxsq %zmm17, %zmm9, %zmm1 + vpminsq %zmm17, %zmm9, %zmm1{%k2} + vpermq $27, %zmm8, %zmm17 + vpmaxsq %zmm17, %zmm8, %zmm9 + vpminsq %zmm17, %zmm8, %zmm9{%k2} + vpermq $27, %zmm7, %zmm17 + vpmaxsq %zmm17, %zmm7, %zmm8 + vpminsq %zmm17, %zmm7, %zmm8{%k2} + vpermq $27, %zmm6, %zmm17 + vpmaxsq %zmm17, %zmm6, %zmm7 + vpminsq %zmm17, %zmm6, %zmm7{%k2} + vpermq $27, %zmm0, %zmm17 + vpmaxsq %zmm17, %zmm0, %zmm6 + vpminsq %zmm17, %zmm0, %zmm6{%k2} + vpshufd $78, %zmm16, %zmm0 + vpmaxsq %zmm0, %zmm16, %zmm17 + vpminsq %zmm0, %zmm16, %zmm17{%k1} + vpshufd $78, %zmm15, %zmm0 + vpmaxsq %zmm0, %zmm15, %zmm16 + vpminsq %zmm0, %zmm15, %zmm16{%k1} + vpshufd $78, %zmm14, %zmm0 + vpmaxsq %zmm0, %zmm14, %zmm15 + vpminsq %zmm0, %zmm14, %zmm15{%k1} + vpshufd $78, %zmm11, %zmm0 + vpmaxsq %zmm0, %zmm11, %zmm14 + vpminsq %zmm0, %zmm11, %zmm14{%k1} + vpshufd $78, %zmm13, %zmm0 + vpmaxsq %zmm0, %zmm13, %zmm11 + vpminsq %zmm0, %zmm13, %zmm11{%k1} + vpshufd $78, %zmm12, %zmm0 + vpmaxsq %zmm0, %zmm12, %zmm13 + vpminsq %zmm0, 
%zmm12, %zmm13{%k1} + vpshufd $78, %zmm10, %zmm0 + vpmaxsq %zmm0, %zmm10, %zmm12 + vpminsq %zmm0, %zmm10, %zmm12{%k1} + vpshufd $78, %zmm5, %zmm0 + vpmaxsq %zmm0, %zmm5, %zmm10 + vpminsq %zmm0, %zmm5, %zmm10{%k1} + vpshufd $78, %zmm3, %zmm0 + vpmaxsq %zmm0, %zmm3, %zmm5 + vpminsq %zmm0, %zmm3, %zmm5{%k1} + vpshufd $78, %zmm2, %zmm0 + vpmaxsq %zmm0, %zmm2, %zmm3 + vpminsq %zmm0, %zmm2, %zmm3{%k1} + vpshufd $78, %zmm4, %zmm0 + vpmaxsq %zmm0, %zmm4, %zmm2 + vpminsq %zmm0, %zmm4, %zmm2{%k1} + vpshufd $78, %zmm1, %zmm0 + vpmaxsq %zmm0, %zmm1, %zmm4 + vpminsq %zmm0, %zmm1, %zmm4{%k1} + vpshufd $78, %zmm9, %zmm0 + vpmaxsq %zmm0, %zmm9, %zmm1 + vpminsq %zmm0, %zmm9, %zmm1{%k1} + vpshufd $78, %zmm8, %zmm9 + vpmaxsq %zmm9, %zmm8, %zmm0 + vpminsq %zmm9, %zmm8, %zmm0{%k1} + vpshufd $78, %zmm7, %zmm9 + vpmaxsq %zmm9, %zmm7, %zmm8 + vpminsq %zmm9, %zmm7, %zmm8{%k1} + vpshufd $78, %zmm6, %zmm9 + vpmaxsq %zmm9, %zmm6, %zmm7 + vpminsq %zmm9, %zmm6, %zmm7{%k1} + jbe .L170 + vmovdqa64 .LC2(%rip), %zmm6 + movl $65535, %r15d + kmovd %r15d, %k3 + vpermq %zmm7, %zmm6, %zmm7 + vpermq %zmm5, %zmm6, %zmm5 + vpermq %zmm3, %zmm6, %zmm3 + vpermq %zmm2, %zmm6, %zmm2 + vpermq %zmm4, %zmm6, %zmm4 + vpermq %zmm1, %zmm6, %zmm1 + vpermq %zmm0, %zmm6, %zmm0 + vpermq %zmm8, %zmm6, %zmm8 + vpminsq %zmm7, %zmm17, %zmm9 + vpmaxsq %zmm7, %zmm17, %zmm19 + vpminsq %zmm8, %zmm16, %zmm7 + vpmaxsq %zmm8, %zmm16, %zmm8 + vpminsq %zmm0, %zmm15, %zmm16 + vpmaxsq %zmm0, %zmm15, %zmm0 + vpminsq %zmm1, %zmm14, %zmm15 + vpermq %zmm8, %zmm6, %zmm8 + vpmaxsq %zmm1, %zmm14, %zmm1 + vpminsq %zmm4, %zmm11, %zmm14 + vpermq %zmm0, %zmm6, %zmm0 + vpmaxsq %zmm4, %zmm11, %zmm4 + vpminsq %zmm2, %zmm13, %zmm11 + vpermq %zmm14, %zmm6, %zmm14 + vpmaxsq %zmm2, %zmm13, %zmm2 + vpminsq %zmm3, %zmm12, %zmm13 + vpermq %zmm11, %zmm6, %zmm11 + vpmaxsq %zmm3, %zmm12, %zmm3 + vpminsq %zmm5, %zmm10, %zmm12 + vpermq %zmm13, %zmm6, %zmm17 + vpmaxsq %zmm5, %zmm10, %zmm5 + vpermq %zmm19, %zmm6, %zmm13 + vpermq %zmm12, %zmm6, %zmm12 + vpminsq %zmm12, %zmm9, %zmm10 + vpmaxsq %zmm12, %zmm9, %zmm19 + vpermq %zmm1, %zmm6, %zmm1 + vpminsq %zmm13, %zmm5, %zmm12 + vpmaxsq %zmm13, %zmm5, %zmm20 + vpminsq %zmm8, %zmm3, %zmm9 + vpminsq %zmm17, %zmm7, %zmm13 + vpminsq %zmm11, %zmm16, %zmm5 + vpmaxsq %zmm17, %zmm7, %zmm7 + vpmaxsq %zmm8, %zmm3, %zmm17 + vpmaxsq %zmm11, %zmm16, %zmm3 + vpermq %zmm19, %zmm6, %zmm11 + vpminsq %zmm14, %zmm15, %zmm16 + vpminsq %zmm0, %zmm2, %zmm8 + vpmaxsq %zmm14, %zmm15, %zmm14 + vpermq %zmm16, %zmm6, %zmm16 + vpermq %zmm5, %zmm6, %zmm15 + vpmaxsq %zmm0, %zmm2, %zmm0 + vpminsq %zmm1, %zmm4, %zmm2 + vpermq %zmm20, %zmm6, %zmm5 + vpminsq %zmm16, %zmm10, %zmm19 + vpmaxsq %zmm1, %zmm4, %zmm1 + vpermq %zmm7, %zmm6, %zmm4 + vpermq %zmm8, %zmm6, %zmm7 + vpermq %zmm2, %zmm6, %zmm8 + vpermq %zmm17, %zmm6, %zmm2 + vpmaxsq %zmm16, %zmm10, %zmm17 + vpminsq %zmm15, %zmm13, %zmm16 + vpminsq %zmm11, %zmm14, %zmm10 + vpmaxsq %zmm15, %zmm13, %zmm13 + vpmaxsq %zmm8, %zmm12, %zmm20 + vpmaxsq %zmm11, %zmm14, %zmm15 + vpermq %zmm16, %zmm6, %zmm11 + vpminsq %zmm4, %zmm3, %zmm14 + vpmaxsq %zmm5, %zmm1, %zmm22 + vpmaxsq %zmm4, %zmm3, %zmm3 + vpminsq %zmm2, %zmm0, %zmm21 + vpermq %zmm22, %zmm6, %zmm16 + vpminsq %zmm8, %zmm12, %zmm4 + vpminsq %zmm7, %zmm9, %zmm8 + vpermq %zmm17, %zmm6, %zmm12 + vpmaxsq %zmm7, %zmm9, %zmm9 + vpermq %zmm21, %zmm6, %zmm17 + vpminsq %zmm5, %zmm1, %zmm7 + vpmaxsq %zmm2, %zmm0, %zmm0 + vpermq %zmm14, %zmm6, %zmm5 + vpermq %zmm15, %zmm6, %zmm2 + vpermq %zmm8, %zmm6, %zmm1 + vpminsq %zmm11, %zmm19, %zmm15 + vpermq %zmm20, %zmm6, %zmm8 + vpmaxsq %zmm11, 
%zmm19, %zmm14 + vpminsq %zmm12, %zmm13, %zmm11 + vpmaxsq %zmm12, %zmm13, %zmm13 + vpminsq %zmm5, %zmm10, %zmm12 + vpmaxsq %zmm5, %zmm10, %zmm10 + vpminsq %zmm2, %zmm3, %zmm5 + vpmaxsq %zmm2, %zmm3, %zmm3 + vpminsq %zmm1, %zmm4, %zmm2 + vpmaxsq %zmm1, %zmm4, %zmm4 + vpminsq %zmm8, %zmm9, %zmm1 + vpmaxsq %zmm8, %zmm9, %zmm9 + vpminsq %zmm17, %zmm7, %zmm8 + vpmaxsq %zmm17, %zmm7, %zmm7 + vpminsq %zmm16, %zmm0, %zmm17 + vpmaxsq %zmm16, %zmm0, %zmm0 + vpermq %zmm15, %zmm6, %zmm16 + vpminsq %zmm16, %zmm15, %zmm19 + vpmaxsq %zmm16, %zmm15, %zmm15 + vpermq %zmm14, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm15{%k3} + vpminsq %zmm16, %zmm14, %zmm19 + vpmaxsq %zmm16, %zmm14, %zmm14 + vpermq %zmm11, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm14{%k3} + vpminsq %zmm16, %zmm11, %zmm19 + vpmaxsq %zmm16, %zmm11, %zmm11 + vpermq %zmm13, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm11{%k3} + vpminsq %zmm16, %zmm13, %zmm19 + vpmaxsq %zmm16, %zmm13, %zmm13 + vpermq %zmm12, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm13{%k3} + vpminsq %zmm16, %zmm12, %zmm19 + vpmaxsq %zmm16, %zmm12, %zmm12 + vpermq %zmm10, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm12{%k3} + vpminsq %zmm16, %zmm10, %zmm19 + vpmaxsq %zmm16, %zmm10, %zmm10 + vpermq %zmm5, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm10{%k3} + vpminsq %zmm16, %zmm5, %zmm19 + vpmaxsq %zmm16, %zmm5, %zmm5 + vpermq %zmm3, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm5{%k3} + vpminsq %zmm16, %zmm3, %zmm19 + vpmaxsq %zmm16, %zmm3, %zmm3 + vpermq %zmm2, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm3{%k3} + vpminsq %zmm16, %zmm2, %zmm19 + vpmaxsq %zmm16, %zmm2, %zmm2 + vpermq %zmm4, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm2{%k3} + vpminsq %zmm16, %zmm4, %zmm19 + vpmaxsq %zmm16, %zmm4, %zmm4 + vpermq %zmm1, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm4{%k3} + vpminsq %zmm16, %zmm1, %zmm19 + vpmaxsq %zmm16, %zmm1, %zmm1 + vpermq %zmm9, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm1{%k3} + vpminsq %zmm16, %zmm9, %zmm19 + vpmaxsq %zmm16, %zmm9, %zmm9 + vpermq %zmm8, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm9{%k3} + vpminsq %zmm16, %zmm8, %zmm19 + vpmaxsq %zmm16, %zmm8, %zmm8 + vpermq %zmm7, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm8{%k3} + vpminsq %zmm16, %zmm7, %zmm19 + vpmaxsq %zmm16, %zmm7, %zmm7 + vpermq %zmm17, %zmm6, %zmm16 + vpermq %zmm0, %zmm6, %zmm6 + vmovdqu16 %zmm19, %zmm7{%k3} + vpminsq %zmm16, %zmm17, %zmm19 + vpmaxsq %zmm16, %zmm17, %zmm17 + vpminsq %zmm6, %zmm0, %zmm16 + vpmaxsq %zmm6, %zmm0, %zmm0 + vshufi32x4 $177, %zmm15, %zmm15, %zmm6 + vmovdqu16 %zmm16, %zmm0{%k3} + vpmaxsq %zmm15, %zmm6, %zmm16 + vmovdqu16 %zmm19, %zmm17{%k3} + vpminsq %zmm15, %zmm6, %zmm16{%k2} + vshufi32x4 $177, %zmm14, %zmm14, %zmm6 + vpmaxsq %zmm14, %zmm6, %zmm15 + vpminsq %zmm14, %zmm6, %zmm15{%k2} + vshufi32x4 $177, %zmm11, %zmm11, %zmm6 + vpmaxsq %zmm11, %zmm6, %zmm14 + vpminsq %zmm11, %zmm6, %zmm14{%k2} + vshufi32x4 $177, %zmm13, %zmm13, %zmm6 + vpmaxsq %zmm13, %zmm6, %zmm11 + vpminsq %zmm13, %zmm6, %zmm11{%k2} + vshufi32x4 $177, %zmm12, %zmm12, %zmm6 + vpmaxsq %zmm12, %zmm6, %zmm13 + vpminsq %zmm12, %zmm6, %zmm13{%k2} + vshufi32x4 $177, %zmm10, %zmm10, %zmm6 + vpmaxsq %zmm10, %zmm6, %zmm12 + vpminsq %zmm10, %zmm6, %zmm12{%k2} + vshufi32x4 $177, %zmm5, %zmm5, %zmm6 + vpmaxsq %zmm5, %zmm6, %zmm10 + vpminsq %zmm5, %zmm6, %zmm10{%k2} + vshufi32x4 $177, %zmm3, %zmm3, %zmm6 + vpmaxsq %zmm3, %zmm6, %zmm5 + vpminsq %zmm3, %zmm6, %zmm5{%k2} + vshufi32x4 $177, %zmm2, %zmm2, %zmm6 + vpmaxsq %zmm2, %zmm6, %zmm3 + vpminsq %zmm2, %zmm6, %zmm3{%k2} + vshufi32x4 $177, %zmm4, %zmm4, %zmm6 + vpmaxsq %zmm4, %zmm6, %zmm2 + vpminsq %zmm4, %zmm6, %zmm2{%k2} + 
vshufi32x4 $177, %zmm1, %zmm1, %zmm6 + vpmaxsq %zmm1, %zmm6, %zmm4 + vpminsq %zmm1, %zmm6, %zmm4{%k2} + vshufi32x4 $177, %zmm9, %zmm9, %zmm6 + vpmaxsq %zmm9, %zmm6, %zmm1 + vpminsq %zmm9, %zmm6, %zmm1{%k2} + vshufi32x4 $177, %zmm8, %zmm8, %zmm6 + vpmaxsq %zmm8, %zmm6, %zmm9 + vpminsq %zmm8, %zmm6, %zmm9{%k2} + vshufi32x4 $177, %zmm7, %zmm7, %zmm6 + vpmaxsq %zmm7, %zmm6, %zmm8 + vpminsq %zmm7, %zmm6, %zmm8{%k2} + vshufi32x4 $177, %zmm17, %zmm17, %zmm6 + vpmaxsq %zmm17, %zmm6, %zmm7 + vpminsq %zmm17, %zmm6, %zmm7{%k2} + vshufi32x4 $177, %zmm0, %zmm0, %zmm17 + vpmaxsq %zmm0, %zmm17, %zmm6 + vpminsq %zmm0, %zmm17, %zmm6{%k2} + vpshufd $78, %zmm16, %zmm0 + vpmaxsq %zmm0, %zmm16, %zmm17 + vpminsq %zmm0, %zmm16, %zmm17{%k1} + vpshufd $78, %zmm15, %zmm0 + vpmaxsq %zmm0, %zmm15, %zmm16 + vpminsq %zmm0, %zmm15, %zmm16{%k1} + vpshufd $78, %zmm14, %zmm0 + vpmaxsq %zmm0, %zmm14, %zmm15 + vpminsq %zmm0, %zmm14, %zmm15{%k1} + vpshufd $78, %zmm11, %zmm0 + vpmaxsq %zmm0, %zmm11, %zmm14 + vpminsq %zmm0, %zmm11, %zmm14{%k1} + vpshufd $78, %zmm13, %zmm0 + vpmaxsq %zmm0, %zmm13, %zmm11 + vpminsq %zmm0, %zmm13, %zmm11{%k1} + vpshufd $78, %zmm12, %zmm0 + vpmaxsq %zmm0, %zmm12, %zmm13 + vpminsq %zmm0, %zmm12, %zmm13{%k1} + vpshufd $78, %zmm10, %zmm0 + vpmaxsq %zmm0, %zmm10, %zmm12 + vpminsq %zmm0, %zmm10, %zmm12{%k1} + vpshufd $78, %zmm5, %zmm0 + vpmaxsq %zmm0, %zmm5, %zmm10 + vpminsq %zmm0, %zmm5, %zmm10{%k1} + vpshufd $78, %zmm3, %zmm0 + vpmaxsq %zmm0, %zmm3, %zmm5 + vpminsq %zmm0, %zmm3, %zmm5{%k1} + vpshufd $78, %zmm2, %zmm0 + vpmaxsq %zmm0, %zmm2, %zmm3 + vpminsq %zmm0, %zmm2, %zmm3{%k1} + vpshufd $78, %zmm4, %zmm0 + vpmaxsq %zmm0, %zmm4, %zmm2 + vpminsq %zmm0, %zmm4, %zmm2{%k1} + vpshufd $78, %zmm1, %zmm0 + vpmaxsq %zmm0, %zmm1, %zmm4 + vpminsq %zmm0, %zmm1, %zmm4{%k1} + vpshufd $78, %zmm9, %zmm0 + vpmaxsq %zmm0, %zmm9, %zmm1 + vpminsq %zmm0, %zmm9, %zmm1{%k1} + vpshufd $78, %zmm8, %zmm9 + vpmaxsq %zmm9, %zmm8, %zmm0 + vpminsq %zmm9, %zmm8, %zmm0{%k1} + vpshufd $78, %zmm7, %zmm9 + vpmaxsq %zmm9, %zmm7, %zmm8 + vpminsq %zmm9, %zmm7, %zmm8{%k1} + vpshufd $78, %zmm6, %zmm9 + vpmaxsq %zmm9, %zmm6, %zmm7 + vpminsq %zmm9, %zmm6, %zmm7{%k1} +.L170: + vmovdqu64 %zmm17, (%rdi) + vmovq %xmm18, %rdi + vmovdqu64 %zmm16, (%rdi) + vmovdqu64 %zmm15, (%r14) + vmovdqu64 %zmm14, 0(%r13) + vmovdqu64 %zmm11, (%r12) + vmovdqu64 %zmm13, (%rbx) + vmovq %xmm23, %rbx + vmovdqu64 %zmm12, (%r11) + vmovdqu64 %zmm10, (%r10) + vmovdqu64 %zmm5, (%r9) + vmovdqu64 %zmm3, (%r8) + vmovdqu64 %zmm2, (%rbx) + vmovdqu64 %zmm4, (%rsi) + vmovdqu64 %zmm1, (%rcx) + vmovdqu64 %zmm0, (%rdx) + vmovdqu64 %zmm8, (%rax) + movq -16(%rsp), %rax + vmovdqu64 %zmm7, (%rax) + vzeroupper + leaq -40(%rbp), %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_def_cfa 7, 8 + ret + .cfi_endproc +.LFE18788: + .size _ZN3hwy6N_AVX36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0, .-_ZN3hwy6N_AVX36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + .section .text._ZN3hwy6N_AVX26detail22MaybePartitionTwoValueINS0_4SimdIlLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_AVX26detail22MaybePartitionTwoValueINS0_4SimdIlLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, @function 
+_ZN3hwy6N_AVX26detail22MaybePartitionTwoValueINS0_4SimdIlLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0: +.LFB18789: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + vmovdqa %ymm0, %ymm3 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + .cfi_offset 15, -24 + movq %rdx, %r15 + pushq %r14 + .cfi_offset 14, -32 + movq %rsi, %r14 + pushq %r13 + pushq %r12 + .cfi_offset 13, -40 + .cfi_offset 12, -48 + movq %rdi, %r12 + pushq %rbx + andq $-32, %rsp + subq $96, %rsp + .cfi_offset 3, -56 + cmpq $3, %rsi + jbe .L187 + movl $4, %r13d + xorl %ebx, %ebx + jmp .L178 + .p2align 4,,10 + .p2align 3 +.L174: + vmovmskpd %ymm2, %eax + vmovdqu %ymm3, (%rsi) + popcntq %rax, %rax + addq %rax, %rbx + leaq 4(%r13), %rax + cmpq %r14, %rax + ja .L195 + movq %rax, %r13 +.L178: + vpcmpeqq -32(%r12,%r13,8), %ymm3, %ymm2 + vpcmpeqq -32(%r12,%r13,8), %ymm1, %ymm0 + leaq -4(%r13), %rdx + leaq (%r12,%rbx,8), %rsi + vpor %ymm0, %ymm2, %ymm4 + vmovmskpd %ymm4, %eax + cmpl $15, %eax + je .L174 + vpcmpeqd %ymm3, %ymm3, %ymm3 + vpxor %ymm3, %ymm0, %ymm0 + vpandn %ymm0, %ymm2, %ymm2 + vmovmskpd %ymm2, %eax + tzcntl %eax, %eax + addq %rdx, %rax + vpbroadcastq (%r12,%rax,8), %ymm0 + leaq 4(%rbx), %rax + vmovdqa %ymm0, (%r15) + cmpq %rax, %rdx + jb .L175 + .p2align 4,,10 + .p2align 3 +.L176: + vmovdqu %ymm1, -32(%r12,%rax,8) + movq %rax, %rbx + addq $4, %rax + cmpq %rdx, %rax + jbe .L176 +.L175: + subq %rbx, %rdx + xorl %eax, %eax + vmovq %rdx, %xmm7 + vpbroadcastq %xmm7, %ymm0 + vpcmpgtq .LC3(%rip), %ymm0, %ymm0 + vpmaskmovq %ymm1, %ymm0, (%r12,%rbx,8) +.L172: + leaq -40(%rbp), %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L195: + .cfi_restore_state + movq %r14, %r8 + leaq 0(,%r13,8), %rsi + leaq (%r12,%rbx,8), %r9 + subq %r13, %r8 +.L173: + testq %r8, %r8 + je .L182 + leaq 0(,%r8,8), %rdx + addq %r12, %rsi + movq %rcx, %rdi + movq %r9, 80(%rsp) + movq %r8, 88(%rsp) + vmovdqa %ymm1, (%rsp) + vmovdqa %ymm3, 32(%rsp) + vzeroupper + call memcpy@PLT + movq 88(%rsp), %r8 + movq 80(%rsp), %r9 + vmovdqa 32(%rsp), %ymm3 + vmovdqa (%rsp), %ymm1 + movq %rax, %rcx +.L182: + vmovdqa (%rcx), %ymm0 + vmovq %r8, %xmm6 + vmovdqa .LC3(%rip), %ymm5 + vpbroadcastq %xmm6, %ymm2 + vpcmpeqq %ymm3, %ymm0, %ymm4 + vpcmpgtq %ymm5, %ymm2, %ymm2 + vpcmpeqq %ymm1, %ymm0, %ymm0 + vpand %ymm2, %ymm4, %ymm7 + vpor %ymm4, %ymm0, %ymm0 + vpcmpeqd %ymm4, %ymm4, %ymm4 + vpxor %ymm4, %ymm2, %ymm6 + vpor %ymm6, %ymm0, %ymm0 + vmovmskpd %ymm0, %eax + cmpl $15, %eax + jne .L196 + vmovmskpd %ymm7, %edx + vpmaskmovq %ymm3, %ymm2, (%r9) + popcntq %rdx, %rdx + addq %rbx, %rdx + leaq 4(%rdx), %rax + cmpq %rax, %r14 + jb .L185 + .p2align 4,,10 + .p2align 3 +.L186: + vmovdqu %ymm1, -32(%r12,%rax,8) + movq %rax, %rdx + addq $4, %rax + cmpq %rax, %r14 + jnb .L186 +.L185: + subq %rdx, %r14 + movl $1, %eax + vmovq %r14, %xmm6 + vpbroadcastq %xmm6, %ymm0 + vpcmpgtq %ymm5, %ymm0, %ymm0 + vpmaskmovq %ymm1, %ymm0, (%r12,%rdx,8) + leaq -40(%rbp), %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret +.L187: + .cfi_restore_state + movq %rsi, %r8 + movq %rdi, %r9 + xorl %esi, %esi + xorl %ebx, %ebx + xorl %r13d, %r13d + jmp .L173 +.L196: + vpxor %ymm4, %ymm0, %ymm0 + vmovmskpd %ymm0, %eax + tzcntl %eax, %eax + addq %r13, %rax + vpbroadcastq (%r12,%rax,8), %ymm0 + leaq 4(%rbx), %rax + vmovdqa 
%ymm0, (%r15) + cmpq %r13, %rax + ja .L183 + .p2align 4,,10 + .p2align 3 +.L184: + vmovdqu %ymm1, -32(%r12,%rax,8) + movq %rax, %rbx + leaq 4(%rax), %rax + cmpq %r13, %rax + jbe .L184 + leaq (%r12,%rbx,8), %r9 +.L183: + subq %rbx, %r13 + xorl %eax, %eax + vmovq %r13, %xmm7 + vpbroadcastq %xmm7, %ymm0 + vpcmpgtq %ymm5, %ymm0, %ymm0 + vpmaskmovq %ymm1, %ymm0, (%r9) + jmp .L172 + .cfi_endproc +.LFE18789: + .size _ZN3hwy6N_AVX26detail22MaybePartitionTwoValueINS0_4SimdIlLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, .-_ZN3hwy6N_AVX26detail22MaybePartitionTwoValueINS0_4SimdIlLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + .section .text._ZN3hwy6N_AVX26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_AVX26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0, @function +_ZN3hwy6N_AVX26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0: +.LFB18790: + .cfi_startproc + cmpq %rdx, %rsi + jbe .L210 + leaq (%rdx,%rdx), %rcx + leaq 1(%rdx), %r8 + leaq 1(%rcx), %rax + addq $2, %rcx + cmpq %rax, %rsi + jbe .L210 + movq (%rdi,%rdx,8), %r11 + vmovq %r11, %xmm4 + vpunpcklqdq %xmm4, %xmm4, %xmm0 + jmp .L200 + .p2align 4,,10 + .p2align 3 +.L201: + cmpq %rcx, %rsi + jbe .L210 + movq %rdx, %rax +.L206: + salq $4, %r8 + vmovddup (%rdi,%r8), %xmm1 + vpcmpgtq %xmm3, %xmm1, %xmm1 + vmovmskpd %xmm1, %r8d + andl $1, %r8d + jne .L203 +.L202: + cmpq %rdx, %rax + je .L210 + leaq (%rdi,%rax,8), %rdx + movq (%rdx), %rcx + movq %rcx, (%r10) + movq %r11, (%rdx) + cmpq %rax, %rsi + jbe .L211 + movq %rax, %rdx +.L204: + leaq (%rdx,%rdx), %rcx + leaq 1(%rdx), %r8 + leaq 1(%rcx), %rax + addq $2, %rcx + cmpq %rsi, %rax + jnb .L210 +.L200: + vmovddup (%rdi,%rax,8), %xmm1 + vpcmpgtq %xmm0, %xmm1, %xmm2 + leaq (%rdi,%rdx,8), %r10 + vmovdqa %xmm0, %xmm3 + vmovmskpd %xmm2, %r9d + andl $1, %r9d + je .L201 + cmpq %rcx, %rsi + jbe .L202 + vmovdqa %xmm1, %xmm3 + jmp .L206 + .p2align 4,,10 + .p2align 3 +.L203: + cmpq %rdx, %rcx + je .L212 + leaq (%rdi,%rcx,8), %rax + movq (%rax), %rdx + movq %rdx, (%r10) + movq %rcx, %rdx + movq %r11, (%rax) + jmp .L204 + .p2align 4,,10 + .p2align 3 +.L210: + ret + .p2align 4,,10 + .p2align 3 +.L211: + ret +.L212: + ret + .cfi_endproc +.LFE18790: + .size _ZN3hwy6N_AVX26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0, .-_ZN3hwy6N_AVX26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0 + .section .text._ZN3hwy6N_SSE46detail22MaybePartitionTwoValueINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_SSE46detail22MaybePartitionTwoValueINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, @function +_ZN3hwy6N_SSE46detail22MaybePartitionTwoValueINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0: +.LFB18791: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movdqa %xmm0, %xmm3 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + .cfi_offset 15, -24 
+ movq %rdx, %r15 + pushq %r14 + .cfi_offset 14, -32 + movq %rsi, %r14 + pushq %r13 + .cfi_offset 13, -40 + movq %rdi, %r13 + pushq %r12 + pushq %rbx + subq $56, %rsp + .cfi_offset 12, -48 + .cfi_offset 3, -56 + cmpq $1, %rsi + jbe .L236 + movl $2, %r12d + xorl %ebx, %ebx + jmp .L221 + .p2align 4,,10 + .p2align 3 +.L215: + movmskpd %xmm2, %eax + movups %xmm3, 0(%r13,%rbx,8) + popcntq %rax, %rax + addq %rax, %rbx + leaq 2(%r12), %rax + cmpq %r14, %rax + ja .L268 + movq %rax, %r12 +.L221: + movdqu -16(%r13,%r12,8), %xmm2 + movdqu -16(%r13,%r12,8), %xmm0 + leaq -2(%r12), %rdx + leaq 0(,%rbx,8), %rax + pcmpeqq %xmm3, %xmm2 + pcmpeqq %xmm1, %xmm0 + movdqa %xmm2, %xmm4 + por %xmm0, %xmm4 + movmskpd %xmm4, %esi + cmpl $3, %esi + je .L215 + pcmpeqd %xmm3, %xmm3 + leaq 2(%rbx), %rdi + pxor %xmm3, %xmm0 + pandn %xmm0, %xmm2 + movmskpd %xmm2, %ecx + rep bsfl %ecx, %ecx + movslq %ecx, %rcx + addq %rdx, %rcx + movddup 0(%r13,%rcx,8), %xmm0 + movaps %xmm0, (%r15) + cmpq %rdx, %rdi + ja .L216 + movq %rdx, %rcx + addq %r13, %rax + subq %rbx, %rcx + leaq -2(%rcx), %rsi + movq %rsi, %rcx + andq $-2, %rcx + addq %rbx, %rcx + leaq 16(%r13,%rcx,8), %rcx + .p2align 4,,10 + .p2align 3 +.L217: + movups %xmm1, (%rax) + addq $16, %rax + cmpq %rcx, %rax + jne .L217 + andq $-2, %rsi + leaq (%rsi,%rdi), %rbx +.L216: + subq %rbx, %rdx + leaq 0(,%rbx,8), %rcx + movq %rdx, %xmm0 + punpcklqdq %xmm0, %xmm0 + pcmpgtq .LC0(%rip), %xmm0 + movq %xmm0, %rax + testq %rax, %rax + je .L218 + movq %xmm1, 0(%r13,%rbx,8) +.L218: + pextrq $1, %xmm0, %rax + testq %rax, %rax + jne .L269 +.L229: + addq $56, %rsp + xorl %eax, %eax + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L269: + .cfi_restore_state + pextrq $1, %xmm1, 8(%r13,%rcx) + jmp .L229 + .p2align 4,,10 + .p2align 3 +.L268: + movq %r14, %r8 + leaq 0(,%r12,8), %rsi + leaq 0(,%rbx,8), %r9 + subq %r12, %r8 +.L214: + testq %r8, %r8 + je .L225 + leaq 0(,%r8,8), %rdx + movq %rcx, %rdi + addq %r13, %rsi + movq %r9, -64(%rbp) + movq %r8, -56(%rbp) + movaps %xmm1, -96(%rbp) + movaps %xmm3, -80(%rbp) + call memcpy@PLT + movq -56(%rbp), %r8 + movq -64(%rbp), %r9 + movdqa -80(%rbp), %xmm3 + movdqa -96(%rbp), %xmm1 + movq %rax, %rcx +.L225: + movdqa (%rcx), %xmm0 + movdqa .LC0(%rip), %xmm4 + movq %r8, %xmm2 + punpcklqdq %xmm2, %xmm2 + movdqa %xmm0, %xmm5 + pcmpgtq %xmm4, %xmm2 + pcmpeqq %xmm3, %xmm5 + pcmpeqq %xmm1, %xmm0 + movdqa %xmm2, %xmm6 + movdqa %xmm5, %xmm7 + por %xmm5, %xmm0 + pcmpeqd %xmm5, %xmm5 + pxor %xmm5, %xmm6 + pand %xmm2, %xmm7 + por %xmm6, %xmm0 + movmskpd %xmm0, %eax + cmpl $3, %eax + jne .L270 + movq %xmm2, %rax + testq %rax, %rax + je .L230 + movq %xmm3, 0(%r13,%r9) +.L230: + pextrq $1, %xmm2, %rax + testq %rax, %rax + jne .L271 +.L231: + movmskpd %xmm7, %edx + popcntq %rdx, %rdx + addq %rbx, %rdx + leaq 2(%rdx), %rax + cmpq %rax, %r14 + jb .L232 + .p2align 4,,10 + .p2align 3 +.L233: + movups %xmm1, -16(%r13,%rax,8) + movq %rax, %rdx + addq $2, %rax + cmpq %rax, %r14 + jnb .L233 +.L232: + subq %rdx, %r14 + leaq 0(,%rdx,8), %rcx + movq %r14, %xmm0 + punpcklqdq %xmm0, %xmm0 + pcmpgtq %xmm4, %xmm0 + movq %xmm0, %rax + testq %rax, %rax + je .L234 + movq %xmm1, 0(%r13,%rdx,8) +.L234: + pextrq $1, %xmm0, %rax + testq %rax, %rax + je .L235 + pextrq $1, %xmm1, 8(%r13,%rcx) +.L235: + addq $56, %rsp + movl $1, %eax + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L271: + 
.cfi_restore_state + pextrq $1, %xmm3, 8(%r13,%r9) + jmp .L231 +.L236: + movq %rsi, %r8 + xorl %r9d, %r9d + xorl %esi, %esi + xorl %ebx, %ebx + xorl %r12d, %r12d + jmp .L214 +.L270: + pxor %xmm5, %xmm0 + leaq 2(%rbx), %rsi + movmskpd %xmm0, %eax + rep bsfl %eax, %eax + cltq + addq %r12, %rax + movddup 0(%r13,%rax,8), %xmm0 + movaps %xmm0, (%r15) + cmpq %r12, %rsi + ja .L226 + leaq -2(%r12), %rcx + leaq 0(%r13,%rbx,8), %rax + subq %rbx, %rcx + movq %rcx, %rdx + andq $-2, %rdx + addq %rbx, %rdx + leaq 16(%r13,%rdx,8), %rdx + .p2align 4,,10 + .p2align 3 +.L227: + movups %xmm1, (%rax) + addq $16, %rax + cmpq %rax, %rdx + jne .L227 + andq $-2, %rcx + leaq (%rcx,%rsi), %rbx + leaq 0(,%rbx,8), %r9 +.L226: + subq %rbx, %r12 + movq %r12, %xmm0 + punpcklqdq %xmm0, %xmm0 + pcmpgtq %xmm4, %xmm0 + movq %xmm0, %rax + testq %rax, %rax + je .L228 + movq %xmm1, 0(%r13,%r9) +.L228: + pextrq $1, %xmm0, %rax + testq %rax, %rax + je .L229 + pextrq $1, %xmm1, 8(%r13,%r9) + jmp .L229 + .cfi_endproc +.LFE18791: + .size _ZN3hwy6N_SSE46detail22MaybePartitionTwoValueINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, .-_ZN3hwy6N_SSE46detail22MaybePartitionTwoValueINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + .section .text._ZN3hwy6N_SSE46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_SSE46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0, @function +_ZN3hwy6N_SSE46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0: +.LFB18792: + .cfi_startproc + cmpq %rdx, %rsi + jbe .L272 + leaq (%rdx,%rdx), %rcx + leaq 1(%rdx), %r8 + leaq 1(%rcx), %rax + addq $2, %rcx + cmpq %rax, %rsi + jbe .L272 + movq (%rdi,%rdx,8), %r11 + movq %r11, %xmm4 + movddup %xmm4, %xmm0 + jmp .L275 + .p2align 4,,10 + .p2align 3 +.L276: + cmpq %rcx, %rsi + jbe .L272 + movq %rdx, %rax +.L281: + salq $4, %r8 + movddup (%rdi,%r8), %xmm1 + pcmpgtq %xmm3, %xmm1 + movmskpd %xmm1, %r8d + andl $1, %r8d + jne .L278 +.L277: + cmpq %rdx, %rax + je .L272 + leaq (%rdi,%rax,8), %rdx + movq (%rdx), %rcx + movq %rcx, (%r10) + movq %r11, (%rdx) + cmpq %rax, %rsi + jbe .L285 + movq %rax, %rdx +.L279: + leaq (%rdx,%rdx), %rcx + leaq 1(%rdx), %r8 + leaq 1(%rcx), %rax + addq $2, %rcx + cmpq %rsi, %rax + jnb .L272 +.L275: + movddup (%rdi,%rax,8), %xmm1 + movdqa %xmm1, %xmm2 + leaq (%rdi,%rdx,8), %r10 + movdqa %xmm0, %xmm3 + pcmpgtq %xmm0, %xmm2 + movmskpd %xmm2, %r9d + andl $1, %r9d + je .L276 + cmpq %rcx, %rsi + jbe .L277 + movdqa %xmm1, %xmm3 + jmp .L281 + .p2align 4,,10 + .p2align 3 +.L278: + cmpq %rdx, %rcx + je .L286 + leaq (%rdi,%rcx,8), %rax + movq (%rax), %rdx + movq %rdx, (%r10) + movq %rcx, %rdx + movq %r11, (%rax) + jmp .L279 + .p2align 4,,10 + .p2align 3 +.L272: + ret + .p2align 4,,10 + .p2align 3 +.L285: + ret +.L286: + ret + .cfi_endproc +.LFE18792: + .size _ZN3hwy6N_SSE46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0, .-_ZN3hwy6N_SSE46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0 + .section .text._ZN3hwy6N_SSE46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0,"ax",@progbits + .p2align 4 + .type 
_ZN3hwy6N_SSE46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0, @function +_ZN3hwy6N_SSE46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0: +.LFB18793: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsi, %rax + salq $3, %rax + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + .cfi_offset 15, -24 + leaq (%rdi,%rax), %r15 + pushq %r14 + .cfi_offset 14, -32 + leaq (%r15,%rax), %r14 + pushq %r13 + .cfi_offset 13, -40 + leaq (%r14,%rax), %r13 + pushq %r12 + .cfi_offset 12, -48 + leaq 0(%r13,%rax), %r12 + pushq %rbx + .cfi_offset 3, -56 + leaq (%r12,%rax), %rbx + leaq (%rbx,%rax), %r11 + leaq (%r11,%rax), %r10 + addq $-128, %rsp + leaq (%r10,%rax), %r9 + leaq (%r9,%rax), %r8 + movq %rdi, -136(%rbp) + movq %rsi, -160(%rbp) + movdqu (%r15), %xmm14 + movdqu (%rdi), %xmm7 + leaq (%r8,%rax), %rdi + movdqu 0(%r13), %xmm3 + movdqu (%r14), %xmm8 + leaq (%rdi,%rax), %rsi + movdqa %xmm14, %xmm0 + movdqa %xmm14, %xmm12 + movdqu (%rbx), %xmm13 + movdqu (%r12), %xmm1 + pcmpgtq %xmm7, %xmm0 + movdqu (%r10), %xmm11 + movdqu (%r11), %xmm6 + leaq (%rsi,%rax), %rcx + movdqu (%r8), %xmm10 + leaq (%rcx,%rax), %rdx + movdqu (%rsi), %xmm2 + movdqu (%rdx), %xmm4 + movdqu (%rdi), %xmm5 + movq %rdx, -120(%rbp) + addq %rax, %rdx + pblendvb %xmm0, %xmm7, %xmm12 + addq %rdx, %rax + movdqu (%rcx), %xmm15 + movq %rdx, -128(%rbp) + pblendvb %xmm0, %xmm14, %xmm7 + movdqa %xmm3, %xmm0 + movdqa %xmm3, %xmm14 + movaps %xmm12, -80(%rbp) + pcmpgtq %xmm8, %xmm0 + movdqu (%r9), %xmm12 + movdqu (%rax), %xmm9 + movaps %xmm4, -64(%rbp) + movdqu (%rdx), %xmm4 + pblendvb %xmm0, %xmm8, %xmm14 + pblendvb %xmm0, %xmm3, %xmm8 + movdqa %xmm13, %xmm0 + pcmpgtq %xmm1, %xmm0 + movdqa %xmm13, %xmm3 + pblendvb %xmm0, %xmm1, %xmm3 + pblendvb %xmm0, %xmm13, %xmm1 + movdqa %xmm11, %xmm0 + pcmpgtq %xmm6, %xmm0 + movdqa %xmm11, %xmm13 + pblendvb %xmm0, %xmm6, %xmm13 + pblendvb %xmm0, %xmm11, %xmm6 + movdqu (%r9), %xmm11 + movdqa %xmm10, %xmm0 + pcmpgtq %xmm11, %xmm0 + movdqa %xmm10, %xmm11 + pblendvb %xmm0, %xmm12, %xmm11 + movaps %xmm11, -96(%rbp) + movdqa %xmm12, %xmm11 + movdqa -80(%rbp), %xmm12 + pblendvb %xmm0, %xmm10, %xmm11 + movdqa %xmm2, %xmm0 + movdqa %xmm2, %xmm10 + pcmpgtq %xmm5, %xmm0 + movaps %xmm11, -112(%rbp) + pblendvb %xmm0, %xmm5, %xmm10 + pblendvb %xmm0, %xmm2, %xmm5 + movdqa %xmm10, %xmm11 + movdqa -64(%rbp), %xmm10 + movdqa %xmm10, %xmm0 + movdqa %xmm10, %xmm2 + pcmpgtq %xmm15, %xmm0 + pblendvb %xmm0, %xmm15, %xmm2 + pblendvb %xmm0, %xmm10, %xmm15 + movdqa %xmm9, %xmm0 + pcmpgtq %xmm4, %xmm0 + movdqa %xmm9, %xmm10 + pblendvb %xmm0, %xmm4, %xmm10 + pblendvb %xmm0, %xmm9, %xmm4 + movdqa %xmm14, %xmm0 + pcmpgtq %xmm12, %xmm0 + movdqa %xmm14, %xmm9 + pblendvb %xmm0, %xmm12, %xmm9 + pblendvb %xmm0, %xmm14, %xmm12 + movdqa %xmm8, %xmm0 + pcmpgtq %xmm7, %xmm0 + movdqa %xmm8, %xmm14 + pblendvb %xmm0, %xmm7, %xmm14 + pblendvb %xmm0, %xmm8, %xmm7 + movdqa %xmm13, %xmm0 + pcmpgtq %xmm3, %xmm0 + movaps %xmm7, -176(%rbp) + movdqa %xmm13, %xmm7 + movdqa %xmm6, %xmm8 + pblendvb %xmm0, %xmm3, %xmm7 + pblendvb %xmm0, %xmm13, %xmm3 + movdqa %xmm6, %xmm0 + movdqa -96(%rbp), %xmm13 + pcmpgtq %xmm1, %xmm0 + pblendvb %xmm0, %xmm1, %xmm8 + pblendvb %xmm0, %xmm6, %xmm1 + movdqa %xmm11, %xmm0 + pcmpgtq %xmm13, %xmm0 + movdqa %xmm11, %xmm6 + movaps %xmm1, -64(%rbp) + movdqa %xmm5, %xmm1 + pblendvb %xmm0, %xmm13, %xmm6 + pblendvb %xmm0, %xmm11, %xmm13 + movdqa %xmm5, %xmm0 + movdqa -112(%rbp), %xmm11 + 
pcmpgtq %xmm11, %xmm0 + pblendvb %xmm0, %xmm11, %xmm1 + pblendvb %xmm0, %xmm5, %xmm11 + movdqa %xmm10, %xmm0 + pcmpgtq %xmm2, %xmm0 + movdqa %xmm10, %xmm5 + movaps %xmm11, -80(%rbp) + movdqa -176(%rbp), %xmm11 + movaps %xmm1, -96(%rbp) + pblendvb %xmm0, %xmm2, %xmm5 + pblendvb %xmm0, %xmm10, %xmm2 + movdqa %xmm4, %xmm0 + pcmpgtq %xmm15, %xmm0 + movdqa %xmm4, %xmm10 + pblendvb %xmm0, %xmm15, %xmm10 + pblendvb %xmm0, %xmm4, %xmm15 + movdqa %xmm7, %xmm0 + pcmpgtq %xmm9, %xmm0 + movdqa %xmm7, %xmm4 + pblendvb %xmm0, %xmm9, %xmm4 + pblendvb %xmm0, %xmm7, %xmm9 + movdqa %xmm8, %xmm0 + pcmpgtq %xmm14, %xmm0 + movdqa %xmm8, %xmm7 + pblendvb %xmm0, %xmm14, %xmm7 + pblendvb %xmm0, %xmm8, %xmm14 + movdqa %xmm3, %xmm0 + pcmpgtq %xmm12, %xmm0 + movdqa %xmm3, %xmm8 + pblendvb %xmm0, %xmm12, %xmm8 + pblendvb %xmm0, %xmm3, %xmm12 + movdqa -64(%rbp), %xmm3 + movdqa %xmm3, %xmm0 + movdqa %xmm3, %xmm1 + pcmpgtq %xmm11, %xmm0 + pblendvb %xmm0, %xmm11, %xmm1 + pblendvb %xmm0, -64(%rbp), %xmm11 + movdqa %xmm5, %xmm0 + pcmpgtq %xmm6, %xmm0 + movdqa %xmm1, %xmm3 + movaps %xmm11, -112(%rbp) + movdqa %xmm5, %xmm1 + movdqa -96(%rbp), %xmm11 + pblendvb %xmm0, %xmm6, %xmm1 + pblendvb %xmm0, %xmm5, %xmm6 + movdqa %xmm10, %xmm0 + pcmpgtq %xmm11, %xmm0 + movdqa %xmm10, %xmm5 + pblendvb %xmm0, %xmm11, %xmm5 + pblendvb %xmm0, %xmm10, %xmm11 + movdqa %xmm2, %xmm0 + pcmpgtq %xmm13, %xmm0 + movdqa %xmm2, %xmm10 + movaps %xmm11, -64(%rbp) + movdqa %xmm15, %xmm11 + pblendvb %xmm0, %xmm13, %xmm10 + pblendvb %xmm0, %xmm2, %xmm13 + movdqa -80(%rbp), %xmm2 + movdqa %xmm15, %xmm0 + pcmpgtq %xmm2, %xmm0 + pblendvb %xmm0, %xmm2, %xmm11 + movdqa %xmm11, %xmm2 + movdqa -80(%rbp), %xmm11 + pblendvb %xmm0, %xmm15, %xmm11 + movdqa %xmm1, %xmm0 + movdqa %xmm1, %xmm15 + pcmpgtq %xmm4, %xmm0 + pblendvb %xmm0, %xmm4, %xmm15 + pblendvb %xmm0, %xmm1, %xmm4 + movdqa %xmm5, %xmm0 + pcmpgtq %xmm7, %xmm0 + movaps %xmm15, -80(%rbp) + movdqa %xmm10, %xmm1 + movaps %xmm15, -256(%rbp) + movdqa %xmm5, %xmm15 + pblendvb %xmm0, %xmm7, %xmm15 + pblendvb %xmm0, %xmm5, %xmm7 + movdqa %xmm10, %xmm0 + pcmpgtq %xmm8, %xmm0 + movdqa %xmm2, %xmm5 + pblendvb %xmm0, %xmm8, %xmm1 + pblendvb %xmm0, %xmm10, %xmm8 + movdqa %xmm2, %xmm0 + pcmpgtq %xmm3, %xmm0 + movdqa %xmm14, %xmm10 + pblendvb %xmm0, %xmm3, %xmm5 + pblendvb %xmm0, %xmm2, %xmm3 + movdqa %xmm6, %xmm0 + pcmpgtq %xmm9, %xmm0 + movdqa %xmm6, %xmm2 + pblendvb %xmm0, %xmm9, %xmm2 + pblendvb %xmm0, %xmm6, %xmm9 + movdqa -64(%rbp), %xmm6 + movdqa %xmm6, %xmm0 + pcmpgtq %xmm14, %xmm0 + pblendvb %xmm0, -64(%rbp), %xmm10 + pblendvb %xmm0, %xmm14, %xmm6 + movdqa %xmm13, %xmm0 + pcmpgtq %xmm12, %xmm0 + movdqa %xmm13, %xmm14 + pblendvb %xmm0, %xmm12, %xmm14 + pblendvb %xmm0, %xmm13, %xmm12 + movdqa %xmm11, %xmm0 + movaps %xmm14, -64(%rbp) + movdqa %xmm11, %xmm14 + movdqa -112(%rbp), %xmm11 + movdqa %xmm14, %xmm13 + pcmpgtq %xmm11, %xmm0 + pblendvb %xmm0, %xmm11, %xmm13 + pblendvb %xmm0, %xmm14, %xmm11 + movdqa %xmm8, %xmm0 + pcmpgtq %xmm6, %xmm0 + movaps %xmm11, -176(%rbp) + movdqa %xmm8, %xmm14 + movdqa -64(%rbp), %xmm11 + pblendvb %xmm0, %xmm6, %xmm14 + pblendvb %xmm0, %xmm8, %xmm6 + movdqa %xmm7, %xmm0 + pcmpgtq %xmm11, %xmm0 + movdqa %xmm7, %xmm8 + pblendvb %xmm0, %xmm11, %xmm8 + pblendvb %xmm0, %xmm7, %xmm11 + movdqa %xmm9, %xmm0 + pcmpgtq %xmm5, %xmm0 + movdqa %xmm9, %xmm7 + pblendvb %xmm0, %xmm5, %xmm7 + pblendvb %xmm0, %xmm9, %xmm5 + movdqa %xmm3, %xmm0 + pcmpgtq %xmm13, %xmm0 + movdqa %xmm3, %xmm9 + pblendvb %xmm0, %xmm13, %xmm9 + pblendvb %xmm0, %xmm3, %xmm13 + movdqa %xmm12, %xmm0 + pcmpgtq %xmm10, %xmm0 + 
movdqa %xmm12, %xmm3 + pblendvb %xmm0, %xmm10, %xmm3 + pblendvb %xmm0, %xmm12, %xmm10 + movdqa %xmm4, %xmm0 + pcmpgtq %xmm2, %xmm0 + movdqa %xmm4, %xmm12 + pblendvb %xmm0, %xmm2, %xmm12 + pblendvb %xmm0, %xmm4, %xmm2 + movdqa %xmm1, %xmm0 + pcmpgtq %xmm15, %xmm0 + movdqa %xmm1, %xmm4 + pblendvb %xmm0, %xmm15, %xmm4 + pblendvb %xmm0, %xmm1, %xmm15 + movdqa %xmm12, %xmm0 + pcmpgtq %xmm4, %xmm0 + movdqa %xmm12, %xmm1 + pblendvb %xmm0, %xmm4, %xmm1 + pblendvb %xmm0, %xmm12, %xmm4 + movdqa %xmm3, %xmm0 + pcmpgtq %xmm9, %xmm0 + movdqa %xmm3, %xmm12 + movaps %xmm1, -272(%rbp) + movaps %xmm1, -64(%rbp) + movdqa %xmm2, %xmm1 + pblendvb %xmm0, %xmm9, %xmm12 + pblendvb %xmm0, %xmm3, %xmm9 + movdqa %xmm2, %xmm0 + pcmpgtq %xmm15, %xmm0 + movdqa %xmm4, %xmm3 + pblendvb %xmm0, %xmm15, %xmm1 + pblendvb %xmm0, %xmm2, %xmm15 + movdqa %xmm10, %xmm0 + pcmpgtq %xmm13, %xmm0 + movdqa %xmm10, %xmm2 + pblendvb %xmm0, %xmm13, %xmm2 + pblendvb %xmm0, %xmm10, %xmm13 + movdqa %xmm4, %xmm0 + pcmpgtq %xmm1, %xmm0 + movaps %xmm13, -192(%rbp) + movdqa %xmm8, %xmm10 + pblendvb %xmm0, %xmm1, %xmm3 + pblendvb %xmm0, %xmm4, %xmm1 + movdqa %xmm8, %xmm0 + pcmpgtq %xmm14, %xmm0 + movdqa %xmm1, %xmm13 + movdqa %xmm14, %xmm1 + movaps %xmm3, -288(%rbp) + movdqa %xmm11, %xmm4 + movaps %xmm3, -96(%rbp) + movdqa %xmm15, %xmm3 + pblendvb %xmm0, %xmm14, %xmm10 + pblendvb %xmm0, %xmm8, %xmm1 + movdqa %xmm6, %xmm0 + pcmpgtq %xmm11, %xmm0 + movdqa %xmm6, %xmm8 + pblendvb %xmm0, %xmm6, %xmm4 + pblendvb %xmm0, %xmm11, %xmm8 + movdqa %xmm9, %xmm0 + pcmpgtq %xmm2, %xmm0 + movdqa %xmm9, %xmm6 + movdqa %xmm8, %xmm11 + pblendvb %xmm0, %xmm2, %xmm6 + pblendvb %xmm0, %xmm9, %xmm2 + movdqa %xmm15, %xmm0 + pcmpgtq %xmm7, %xmm0 + movaps %xmm2, -224(%rbp) + movdqa %xmm5, %xmm2 + movdqa %xmm13, %xmm9 + movaps %xmm6, -208(%rbp) + pblendvb %xmm0, %xmm7, %xmm3 + pblendvb %xmm0, %xmm15, %xmm7 + movdqa %xmm5, %xmm0 + pcmpgtq %xmm12, %xmm0 + pblendvb %xmm0, %xmm12, %xmm2 + pblendvb %xmm0, %xmm5, %xmm12 + movdqa %xmm10, %xmm0 + pcmpgtq %xmm3, %xmm0 + movdqa %xmm10, %xmm5 + movdqa %xmm2, %xmm6 + movdqa %xmm7, %xmm2 + pblendvb %xmm0, %xmm3, %xmm5 + pblendvb %xmm0, %xmm10, %xmm3 + movdqa %xmm7, %xmm0 + pcmpgtq %xmm1, %xmm0 + pblendvb %xmm0, %xmm1, %xmm2 + pblendvb %xmm0, %xmm7, %xmm1 + movdqa %xmm8, %xmm0 + pcmpgtq %xmm6, %xmm0 + movdqa %xmm12, %xmm7 + movdqa %xmm2, %xmm14 + pblendvb %xmm0, %xmm6, %xmm11 + pblendvb %xmm0, %xmm8, %xmm6 + movdqa %xmm12, %xmm0 + pcmpgtq %xmm4, %xmm0 + pblendvb %xmm0, %xmm4, %xmm7 + pblendvb %xmm0, %xmm12, %xmm4 + movdqa %xmm13, %xmm0 + pcmpgtq %xmm5, %xmm0 + pblendvb %xmm0, %xmm5, %xmm9 + pblendvb %xmm0, %xmm13, %xmm5 + movdqa %xmm2, %xmm0 + pcmpgtq %xmm3, %xmm0 + movdqa %xmm5, %xmm10 + movdqa %xmm5, %xmm15 + movaps %xmm9, -112(%rbp) + movdqa %xmm7, %xmm5 + cmpq $1, -160(%rbp) + pblendvb %xmm0, %xmm3, %xmm14 + pblendvb %xmm0, %xmm2, %xmm3 + movdqa %xmm1, %xmm0 + pcmpgtq %xmm11, %xmm0 + movdqa %xmm1, %xmm2 + pblendvb %xmm0, %xmm11, %xmm2 + pblendvb %xmm0, %xmm1, %xmm11 + movdqa %xmm7, %xmm0 + pcmpgtq %xmm6, %xmm0 + movdqa %xmm6, %xmm1 + movdqa %xmm2, %xmm13 + pblendvb %xmm0, %xmm6, %xmm5 + movdqa -208(%rbp), %xmm6 + pblendvb %xmm0, %xmm7, %xmm1 + movdqa %xmm4, %xmm0 + movaps %xmm1, -240(%rbp) + movdqa %xmm4, %xmm1 + pcmpgtq %xmm6, %xmm0 + movdqa %xmm6, %xmm7 + pblendvb %xmm0, %xmm4, %xmm7 + pblendvb %xmm0, %xmm6, %xmm1 + movdqa %xmm2, %xmm0 + pcmpgtq %xmm3, %xmm0 + movdqa %xmm5, %xmm4 + pblendvb %xmm0, %xmm3, %xmm13 + pblendvb %xmm0, %xmm2, %xmm3 + movdqa %xmm5, %xmm0 + pcmpgtq %xmm11, %xmm0 + pblendvb %xmm0, %xmm11, %xmm4 + 
pblendvb %xmm0, %xmm5, %xmm11 + jbe .L291 + movdqa -256(%rbp), %xmm12 + pshufd $78, %xmm7, %xmm7 + pshufd $78, -176(%rbp), %xmm6 + movdqa %xmm6, %xmm0 + movdqa %xmm6, %xmm15 + pshufd $78, %xmm1, %xmm1 + pshufd $78, %xmm11, %xmm11 + pshufd $78, -192(%rbp), %xmm2 + pcmpgtq %xmm12, %xmm0 + pshufd $78, %xmm4, %xmm4 + pshufd $78, -224(%rbp), %xmm5 + pshufd $78, -240(%rbp), %xmm8 + pblendvb %xmm0, %xmm12, %xmm15 + pblendvb %xmm0, %xmm6, %xmm12 + movdqa %xmm2, %xmm0 + movaps %xmm12, -64(%rbp) + movdqa %xmm2, %xmm6 + movdqa -272(%rbp), %xmm12 + pcmpgtq %xmm12, %xmm0 + pblendvb %xmm0, %xmm12, %xmm6 + pblendvb %xmm0, %xmm2, %xmm12 + movdqa %xmm5, %xmm0 + movdqa -288(%rbp), %xmm2 + movaps %xmm12, -80(%rbp) + movdqa %xmm5, %xmm12 + pcmpgtq %xmm2, %xmm0 + pblendvb %xmm0, %xmm2, %xmm12 + pblendvb %xmm0, %xmm5, %xmm2 + movdqa %xmm7, %xmm0 + pcmpgtq %xmm9, %xmm0 + movdqa %xmm7, %xmm5 + movaps %xmm12, -96(%rbp) + pblendvb %xmm0, %xmm9, %xmm5 + pblendvb %xmm0, %xmm7, %xmm9 + movdqa %xmm1, %xmm0 + pcmpgtq %xmm10, %xmm0 + movdqa %xmm1, %xmm7 + movaps %xmm5, -112(%rbp) + pshufd $78, %xmm9, %xmm5 + pshufd $78, %xmm2, %xmm9 + pblendvb %xmm0, %xmm10, %xmm7 + pblendvb %xmm0, %xmm1, %xmm10 + movdqa %xmm8, %xmm0 + pcmpgtq %xmm14, %xmm0 + movdqa %xmm8, %xmm1 + pshufd $78, %xmm7, %xmm7 + movaps %xmm10, -160(%rbp) + pblendvb %xmm0, %xmm14, %xmm1 + movdqa %xmm1, %xmm10 + movdqa %xmm14, %xmm1 + movdqa %xmm4, %xmm14 + pblendvb %xmm0, %xmm8, %xmm1 + movdqa %xmm11, %xmm0 + movdqa %xmm11, %xmm8 + pcmpgtq %xmm13, %xmm0 + pshufd $78, %xmm10, %xmm10 + pblendvb %xmm0, %xmm13, %xmm8 + pblendvb %xmm0, %xmm11, %xmm13 + movdqa %xmm4, %xmm0 + pshufd $78, -80(%rbp), %xmm11 + pcmpgtq %xmm3, %xmm0 + pshufd $78, %xmm8, %xmm8 + pblendvb %xmm0, %xmm3, %xmm14 + pblendvb %xmm0, %xmm4, %xmm3 + pshufd $78, -64(%rbp), %xmm4 + pshufd $78, %xmm14, %xmm14 + movdqa %xmm3, %xmm2 + movdqa %xmm14, %xmm0 + movdqa %xmm14, %xmm12 + pcmpgtq %xmm15, %xmm0 + pblendvb %xmm0, %xmm15, %xmm12 + pblendvb %xmm0, %xmm14, %xmm15 + movdqa %xmm4, %xmm0 + pcmpgtq %xmm3, %xmm0 + movdqa %xmm4, %xmm14 + pblendvb %xmm0, %xmm3, %xmm14 + pblendvb %xmm0, %xmm4, %xmm2 + movdqa %xmm8, %xmm0 + movdqa -96(%rbp), %xmm3 + pcmpgtq %xmm6, %xmm0 + movdqa %xmm8, %xmm4 + movaps %xmm14, -64(%rbp) + movdqa %xmm2, %xmm14 + movdqa %xmm10, %xmm2 + pblendvb %xmm0, %xmm6, %xmm4 + pblendvb %xmm0, %xmm8, %xmm6 + movdqa %xmm11, %xmm0 + pcmpgtq %xmm13, %xmm0 + movdqa %xmm11, %xmm8 + pshufd $78, %xmm6, %xmm6 + pblendvb %xmm0, %xmm13, %xmm8 + pblendvb %xmm0, %xmm11, %xmm13 + movdqa %xmm10, %xmm0 + pcmpgtq %xmm3, %xmm0 + movaps %xmm8, -80(%rbp) + movdqa %xmm9, %xmm11 + pshufd $78, %xmm13, %xmm13 + pblendvb %xmm0, %xmm3, %xmm2 + movdqa %xmm2, %xmm8 + movdqa %xmm3, %xmm2 + movdqa -112(%rbp), %xmm3 + pblendvb %xmm0, %xmm10, %xmm2 + movdqa %xmm9, %xmm0 + pshufd $78, %xmm8, %xmm8 + pcmpgtq %xmm1, %xmm0 + pshufd $78, %xmm15, %xmm10 + pblendvb %xmm0, %xmm1, %xmm11 + pblendvb %xmm0, %xmm9, %xmm1 + movdqa %xmm7, %xmm0 + pcmpgtq %xmm3, %xmm0 + movdqa %xmm7, %xmm9 + movaps %xmm11, -96(%rbp) + pshufd $78, -96(%rbp), %xmm15 + pblendvb %xmm0, %xmm3, %xmm9 + pblendvb %xmm0, %xmm7, %xmm3 + movdqa %xmm5, %xmm0 + movdqa %xmm3, %xmm11 + pshufd $78, %xmm9, %xmm9 + movdqa %xmm5, %xmm7 + movdqa -160(%rbp), %xmm3 + pcmpgtq %xmm3, %xmm0 + pblendvb %xmm0, %xmm3, %xmm7 + pblendvb %xmm0, %xmm5, %xmm3 + movdqa %xmm9, %xmm0 + pcmpgtq %xmm12, %xmm0 + pshufd $78, %xmm14, %xmm5 + movdqa %xmm9, %xmm14 + pshufd $78, %xmm7, %xmm7 + pblendvb %xmm0, %xmm12, %xmm14 + pblendvb %xmm0, %xmm9, %xmm12 + movdqa %xmm8, %xmm0 + pcmpgtq %xmm4, 
%xmm0 + movdqa %xmm8, %xmm9 + pblendvb %xmm0, %xmm4, %xmm9 + pblendvb %xmm0, %xmm8, %xmm4 + movdqa %xmm10, %xmm0 + pcmpgtq %xmm11, %xmm0 + movdqa %xmm10, %xmm8 + movaps %xmm9, -96(%rbp) + movdqa -80(%rbp), %xmm9 + pblendvb %xmm0, %xmm11, %xmm8 + pblendvb %xmm0, %xmm10, %xmm11 + movdqa %xmm6, %xmm0 + pcmpgtq %xmm2, %xmm0 + movdqa %xmm6, %xmm10 + movaps %xmm8, -112(%rbp) + pshufd $78, %xmm11, %xmm11 + pblendvb %xmm0, %xmm2, %xmm10 + pblendvb %xmm0, %xmm6, %xmm2 + movdqa -64(%rbp), %xmm6 + movdqa %xmm7, %xmm0 + movdqa %xmm10, %xmm8 + movdqa %xmm7, %xmm10 + pcmpgtq %xmm6, %xmm0 + pshufd $78, %xmm8, %xmm8 + pblendvb %xmm0, %xmm6, %xmm10 + pblendvb %xmm0, %xmm7, %xmm6 + movdqa %xmm15, %xmm0 + pcmpgtq %xmm9, %xmm0 + movdqa %xmm15, %xmm7 + pshufd $78, %xmm6, %xmm6 + pblendvb %xmm0, %xmm9, %xmm7 + pblendvb %xmm0, %xmm15, %xmm9 + movdqa %xmm5, %xmm0 + pcmpgtq %xmm3, %xmm0 + movdqa %xmm5, %xmm15 + pshufd $78, %xmm7, %xmm7 + pblendvb %xmm0, %xmm3, %xmm15 + pblendvb %xmm0, %xmm5, %xmm3 + movdqa %xmm13, %xmm0 + pcmpgtq %xmm1, %xmm0 + movdqa %xmm13, %xmm5 + movaps %xmm15, -64(%rbp) + pshufd $78, -96(%rbp), %xmm15 + pshufd $78, %xmm3, %xmm3 + pblendvb %xmm0, %xmm1, %xmm5 + pblendvb %xmm0, %xmm13, %xmm1 + movdqa %xmm15, %xmm0 + pcmpgtq %xmm14, %xmm0 + pshufd $78, %xmm12, %xmm13 + movdqa %xmm15, %xmm12 + pshufd $78, %xmm5, %xmm5 + pblendvb %xmm0, %xmm14, %xmm12 + pblendvb %xmm0, %xmm15, %xmm14 + movdqa %xmm13, %xmm0 + pcmpgtq %xmm4, %xmm0 + movaps %xmm12, -80(%rbp) + movdqa %xmm13, %xmm15 + movdqa -112(%rbp), %xmm12 + pblendvb %xmm0, %xmm4, %xmm15 + pblendvb %xmm0, %xmm13, %xmm4 + movdqa %xmm8, %xmm0 + pcmpgtq %xmm12, %xmm0 + movdqa %xmm8, %xmm13 + pblendvb %xmm0, %xmm12, %xmm13 + pblendvb %xmm0, %xmm8, %xmm12 + movdqa %xmm11, %xmm0 + pcmpgtq %xmm2, %xmm0 + movdqa %xmm11, %xmm8 + pblendvb %xmm0, %xmm2, %xmm8 + pblendvb %xmm0, %xmm11, %xmm2 + movdqa %xmm7, %xmm0 + pcmpgtq %xmm10, %xmm0 + movdqa %xmm7, %xmm11 + pblendvb %xmm0, %xmm10, %xmm11 + pblendvb %xmm0, %xmm7, %xmm10 + movdqa %xmm6, %xmm0 + pcmpgtq %xmm9, %xmm0 + movdqa %xmm6, %xmm7 + pblendvb %xmm0, %xmm9, %xmm7 + pblendvb %xmm0, %xmm6, %xmm9 + movdqa %xmm5, %xmm0 + movdqa -64(%rbp), %xmm6 + movaps %xmm7, -160(%rbp) + movdqa %xmm5, %xmm7 + pcmpgtq %xmm6, %xmm0 + pblendvb %xmm0, %xmm6, %xmm7 + pblendvb %xmm0, %xmm5, %xmm6 + movdqa %xmm3, %xmm0 + pcmpgtq %xmm1, %xmm0 + movdqa %xmm3, %xmm5 + pblendvb %xmm0, %xmm1, %xmm5 + pblendvb %xmm0, %xmm3, %xmm1 + movaps %xmm1, -176(%rbp) + movdqa -80(%rbp), %xmm3 + pshufd $78, %xmm3, %xmm1 + movdqa %xmm1, %xmm0 + pcmpgtq %xmm3, %xmm0 + punpckhqdq %xmm0, %xmm0 + pblendvb %xmm0, %xmm1, %xmm3 + pshufd $78, %xmm14, %xmm1 + movdqa %xmm1, %xmm0 + movaps %xmm3, -80(%rbp) + movdqa -160(%rbp), %xmm3 + pcmpgtq %xmm14, %xmm0 + punpckhqdq %xmm0, %xmm0 + pblendvb %xmm0, %xmm1, %xmm14 + pshufd $78, %xmm15, %xmm1 + movdqa %xmm1, %xmm0 + movaps %xmm14, -64(%rbp) + movdqa -176(%rbp), %xmm14 + pcmpgtq %xmm15, %xmm0 + punpckhqdq %xmm0, %xmm0 + pblendvb %xmm0, %xmm1, %xmm15 + pshufd $78, %xmm4, %xmm1 + movdqa %xmm1, %xmm0 + movaps %xmm15, -96(%rbp) + pcmpgtq %xmm4, %xmm0 + punpckhqdq %xmm0, %xmm0 + pblendvb %xmm0, %xmm1, %xmm4 + pshufd $78, %xmm13, %xmm1 + movdqa %xmm1, %xmm0 + movaps %xmm4, -112(%rbp) + pcmpgtq %xmm13, %xmm0 + punpckhqdq %xmm0, %xmm0 + pblendvb %xmm0, %xmm1, %xmm13 + pshufd $78, %xmm12, %xmm1 + movdqa %xmm1, %xmm0 + movdqa %xmm13, %xmm15 + pcmpgtq %xmm12, %xmm0 + punpckhqdq %xmm0, %xmm0 + pblendvb %xmm0, %xmm1, %xmm12 + pshufd $78, %xmm8, %xmm1 + movdqa %xmm1, %xmm0 + pcmpgtq %xmm8, %xmm0 + punpckhqdq %xmm0, %xmm0 + 
pblendvb %xmm0, %xmm1, %xmm8 + pshufd $78, %xmm2, %xmm1 + movdqa %xmm1, %xmm0 + pcmpgtq %xmm2, %xmm0 + punpckhqdq %xmm0, %xmm0 + pblendvb %xmm0, %xmm1, %xmm2 + pshufd $78, %xmm11, %xmm1 + movdqa %xmm1, %xmm0 + pcmpgtq %xmm11, %xmm0 + punpckhqdq %xmm0, %xmm0 + pblendvb %xmm0, %xmm1, %xmm11 + pshufd $78, %xmm10, %xmm1 + movdqa %xmm1, %xmm0 + movdqa %xmm11, %xmm4 + pcmpgtq %xmm10, %xmm0 + punpckhqdq %xmm0, %xmm0 + pblendvb %xmm0, %xmm1, %xmm10 + pshufd $78, %xmm3, %xmm1 + movdqa %xmm1, %xmm0 + movdqa %xmm10, %xmm11 + pcmpgtq %xmm3, %xmm0 + punpckhqdq %xmm0, %xmm0 + pblendvb %xmm0, %xmm1, %xmm3 + movdqa %xmm3, %xmm10 + pshufd $78, %xmm9, %xmm3 + movdqa %xmm3, %xmm0 + pcmpgtq %xmm9, %xmm0 + punpckhqdq %xmm0, %xmm0 + pblendvb %xmm0, %xmm3, %xmm9 + pshufd $78, %xmm7, %xmm3 + movdqa %xmm3, %xmm0 + movdqa %xmm9, %xmm1 + pshufd $78, %xmm5, %xmm9 + pcmpgtq %xmm7, %xmm0 + punpckhqdq %xmm0, %xmm0 + pblendvb %xmm0, %xmm3, %xmm7 + pshufd $78, %xmm6, %xmm3 + movdqa %xmm3, %xmm0 + pcmpgtq %xmm6, %xmm0 + punpckhqdq %xmm0, %xmm0 + pblendvb %xmm0, %xmm3, %xmm6 + movdqa %xmm9, %xmm0 + pcmpgtq %xmm5, %xmm0 + punpckhqdq %xmm0, %xmm0 + pblendvb %xmm0, %xmm9, %xmm5 + movdqa %xmm5, %xmm3 + pshufd $78, %xmm14, %xmm5 + movdqa %xmm5, %xmm0 + pcmpgtq %xmm14, %xmm0 + punpckhqdq %xmm0, %xmm0 + pblendvb %xmm0, %xmm5, %xmm14 + movdqa %xmm14, %xmm0 +.L289: + movdqa -80(%rbp), %xmm5 + movq -136(%rbp), %rdx + movups %xmm5, (%rdx) + movdqa -64(%rbp), %xmm5 + movups %xmm5, (%r15) + movdqa -96(%rbp), %xmm5 + movups %xmm5, (%r14) + movdqa -112(%rbp), %xmm5 + movups %xmm5, 0(%r13) + movups %xmm15, (%r12) + movups %xmm12, (%rbx) + movq -128(%rbp), %rbx + movups %xmm8, (%r11) + movups %xmm2, (%r10) + movups %xmm4, (%r9) + movups %xmm11, (%r8) + movups %xmm10, (%rdi) + movups %xmm1, (%rsi) + movups %xmm7, (%rcx) + movq -120(%rbp), %rcx + movups %xmm6, (%rcx) + movups %xmm3, (%rbx) + movups %xmm0, (%rax) + subq $-128, %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L291: + .cfi_restore_state + movdqa %xmm3, %xmm2 + movdqa %xmm14, %xmm12 + movdqa -176(%rbp), %xmm0 + movdqa -192(%rbp), %xmm3 + movdqa -224(%rbp), %xmm6 + movdqa %xmm13, %xmm8 + movdqa -240(%rbp), %xmm10 + jmp .L289 + .cfi_endproc +.LFE18793: + .size _ZN3hwy6N_SSE46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0, .-_ZN3hwy6N_SSE46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + .section .text._ZN3hwy7N_SSSE36detail22MaybePartitionTwoValueINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy7N_SSSE36detail22MaybePartitionTwoValueINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, @function +_ZN3hwy7N_SSSE36detail22MaybePartitionTwoValueINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0: +.LFB18794: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + pushq %r14 + .cfi_offset 15, -24 + .cfi_offset 14, -32 + movq %rdi, %r14 + pushq %r13 + .cfi_offset 13, -40 + movq %rcx, %r13 + pushq %r12 + .cfi_offset 12, -48 + movq %rsi, %r12 + pushq %rbx + subq $88, %rsp + .cfi_offset 3, -56 + 
movq %rdx, -120(%rbp) + movaps %xmm0, -80(%rbp) + movaps %xmm1, -64(%rbp) + movaps %xmm0, -96(%rbp) + movaps %xmm1, -112(%rbp) + cmpq $1, %rsi + jbe .L315 + movl $2, %r15d + xorl %ebx, %ebx + jmp .L300 + .p2align 4,,10 + .p2align 3 +.L294: + movdqa -80(%rbp), %xmm5 + movmskpd %xmm1, %edi + movups %xmm5, (%r14,%rbx,8) + call __popcountdi2@PLT + cltq + addq %rax, %rbx + leaq 2(%r15), %rax + cmpq %r12, %rax + ja .L347 + movq %rax, %r15 +.L300: + movdqu -16(%r14,%r15,8), %xmm0 + leaq -2(%r15), %rdx + leaq 0(,%rbx,8), %rax + pcmpeqd -96(%rbp), %xmm0 + pshufd $177, %xmm0, %xmm1 + pand %xmm0, %xmm1 + movdqu -16(%r14,%r15,8), %xmm0 + pcmpeqd -112(%rbp), %xmm0 + pshufd $177, %xmm0, %xmm2 + pand %xmm2, %xmm0 + movdqa %xmm1, %xmm2 + por %xmm0, %xmm2 + movmskpd %xmm2, %ecx + cmpl $3, %ecx + je .L294 + pcmpeqd %xmm2, %xmm2 + movq -120(%rbp), %rsi + leaq 2(%rbx), %rdi + pxor %xmm2, %xmm0 + pandn %xmm0, %xmm1 + movmskpd %xmm1, %ecx + rep bsfl %ecx, %ecx + movslq %ecx, %rcx + addq %rdx, %rcx + movddup (%r14,%rcx,8), %xmm0 + movaps %xmm0, (%rsi) + cmpq %rdx, %rdi + ja .L295 + movq %rdx, %rcx + addq %r14, %rax + subq %rbx, %rcx + leaq -2(%rcx), %rsi + movq %rsi, %rcx + andq $-2, %rcx + addq %rbx, %rcx + leaq 16(%r14,%rcx,8), %rcx + .p2align 4,,10 + .p2align 3 +.L296: + movdqa -64(%rbp), %xmm4 + addq $16, %rax + movups %xmm4, -16(%rax) + cmpq %rcx, %rax + jne .L296 + andq $-2, %rsi + leaq (%rsi,%rdi), %rbx +.L295: + subq %rbx, %rdx + movdqa .LC1(%rip), %xmm2 + movdqa .LC0(%rip), %xmm1 + leaq 0(,%rbx,8), %rcx + movq %rdx, %xmm0 + punpcklqdq %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + psubq %xmm0, %xmm1 + pcmpeqd %xmm2, %xmm3 + pcmpgtd %xmm2, %xmm0 + pand %xmm3, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movq %xmm0, %rax + testq %rax, %rax + je .L297 + movdqa -64(%rbp), %xmm3 + movq %xmm3, (%r14,%rbx,8) +.L297: + movhlps %xmm0, %xmm3 + movq %xmm3, %rax + testq %rax, %rax + jne .L348 +.L308: + addq $88, %rsp + xorl %eax, %eax + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L348: + .cfi_restore_state + movdqa -64(%rbp), %xmm3 + movhps %xmm3, 8(%r14,%rcx) + jmp .L308 + .p2align 4,,10 + .p2align 3 +.L347: + movq %r12, %rcx + leaq 0(,%r15,8), %rsi + leaq 0(,%rbx,8), %r9 + subq %r15, %rcx +.L293: + testq %rcx, %rcx + je .L304 + leaq 0(,%rcx,8), %rdx + addq %r14, %rsi + movq %r13, %rdi + movq %r9, -112(%rbp) + movq %rcx, -96(%rbp) + call memcpy@PLT + movq -96(%rbp), %rcx + movq -112(%rbp), %r9 +.L304: + movdqa .LC1(%rip), %xmm3 + movq %rcx, %xmm2 + movdqa -80(%rbp), %xmm5 + movdqa .LC0(%rip), %xmm1 + punpcklqdq %xmm2, %xmm2 + movdqa %xmm3, %xmm4 + pcmpeqd %xmm2, %xmm4 + movdqa %xmm1, %xmm0 + psubq %xmm2, %xmm0 + pcmpgtd %xmm3, %xmm2 + pand %xmm4, %xmm0 + por %xmm2, %xmm0 + movdqa 0(%r13), %xmm2 + pshufd $245, %xmm0, %xmm0 + pcmpeqd %xmm2, %xmm5 + pcmpeqd -64(%rbp), %xmm2 + pshufd $177, %xmm5, %xmm4 + pand %xmm0, %xmm4 + pand %xmm5, %xmm4 + pshufd $177, %xmm2, %xmm5 + pand %xmm5, %xmm2 + pcmpeqd %xmm5, %xmm5 + movdqa %xmm5, %xmm6 + pxor %xmm0, %xmm6 + por %xmm6, %xmm2 + por %xmm4, %xmm2 + movmskpd %xmm2, %eax + cmpl $3, %eax + jne .L349 + movq %xmm0, %rax + testq %rax, %rax + je .L309 + movdqa -80(%rbp), %xmm5 + movq %xmm5, (%r14,%r9) +.L309: + movhlps %xmm0, %xmm6 + movq %xmm6, %rax + testq %rax, %rax + jne .L350 +.L310: + movmskpd %xmm4, %edi + call __popcountdi2@PLT + movdqa .LC0(%rip), %xmm1 + movdqa .LC1(%rip), %xmm3 + movslq %eax, %rdx + addq %rbx, %rdx + leaq 2(%rdx), %rax + cmpq %rax, %r12 + jb .L311 + 
.p2align 4,,10 + .p2align 3 +.L312: + movdqa -64(%rbp), %xmm7 + movq %rax, %rdx + movups %xmm7, -16(%r14,%rax,8) + addq $2, %rax + cmpq %rax, %r12 + jnb .L312 +.L311: + subq %rdx, %r12 + movdqa %xmm3, %xmm2 + leaq 0(,%rdx,8), %rcx + movq %r12, %xmm0 + punpcklqdq %xmm0, %xmm0 + pcmpeqd %xmm0, %xmm2 + psubq %xmm0, %xmm1 + pcmpgtd %xmm3, %xmm0 + pand %xmm2, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movq %xmm0, %rax + testq %rax, %rax + je .L313 + movdqa -64(%rbp), %xmm3 + movq %xmm3, (%r14,%rdx,8) +.L313: + movhlps %xmm0, %xmm3 + movq %xmm3, %rax + testq %rax, %rax + je .L314 + movdqa -64(%rbp), %xmm3 + movhps %xmm3, 8(%r14,%rcx) +.L314: + addq $88, %rsp + movl $1, %eax + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L350: + .cfi_restore_state + movdqa -80(%rbp), %xmm6 + movhps %xmm6, 8(%r14,%r9) + jmp .L310 +.L315: + movq %rsi, %rcx + xorl %r9d, %r9d + xorl %esi, %esi + xorl %ebx, %ebx + xorl %r15d, %r15d + jmp .L293 +.L349: + pxor %xmm5, %xmm2 + leaq 2(%rbx), %rsi + movmskpd %xmm2, %eax + rep bsfl %eax, %eax + cltq + addq %r15, %rax + movddup (%r14,%rax,8), %xmm0 + movq -120(%rbp), %rax + movaps %xmm0, (%rax) + cmpq %r15, %rsi + ja .L305 + leaq -2(%r15), %rcx + leaq (%r14,%rbx,8), %rax + subq %rbx, %rcx + movq %rcx, %rdx + andq $-2, %rdx + addq %rbx, %rdx + leaq 16(%r14,%rdx,8), %rdx + .p2align 4,,10 + .p2align 3 +.L306: + movdqa -64(%rbp), %xmm4 + addq $16, %rax + movups %xmm4, -16(%rax) + cmpq %rax, %rdx + jne .L306 + andq $-2, %rcx + leaq (%rcx,%rsi), %rbx + leaq 0(,%rbx,8), %r9 +.L305: + subq %rbx, %r15 + movdqa %xmm3, %xmm2 + movq %r15, %xmm0 + punpcklqdq %xmm0, %xmm0 + pcmpeqd %xmm0, %xmm2 + psubq %xmm0, %xmm1 + pcmpgtd %xmm3, %xmm0 + pand %xmm2, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movq %xmm0, %rax + testq %rax, %rax + je .L307 + movdqa -64(%rbp), %xmm3 + movq %xmm3, (%r14,%r9) +.L307: + movhlps %xmm0, %xmm3 + movq %xmm3, %rax + testq %rax, %rax + je .L308 + movdqa -64(%rbp), %xmm3 + movhps %xmm3, 8(%r14,%r9) + jmp .L308 + .cfi_endproc +.LFE18794: + .size _ZN3hwy7N_SSSE36detail22MaybePartitionTwoValueINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0, .-_ZN3hwy7N_SSSE36detail22MaybePartitionTwoValueINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + .section .text._ZN3hwy7N_SSSE36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy7N_SSSE36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0, @function +_ZN3hwy7N_SSSE36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0: +.LFB18795: + .cfi_startproc + cmpq %rdx, %rsi + jbe .L351 + leaq (%rdx,%rdx), %rcx + leaq 1(%rdx), %r8 + leaq 1(%rcx), %rax + addq $2, %rcx + cmpq %rax, %rsi + jbe .L351 + movq (%rdi,%rdx,8), %r11 + movq %r11, %xmm7 + movddup %xmm7, %xmm3 + movdqa %xmm3, %xmm6 + jmp .L354 + .p2align 4,,10 + .p2align 3 +.L360: + movq %rdx, %rax +.L355: + cmpq %rcx, %rsi + jbe .L356 + salq $4, %r8 + movddup (%rdi,%r8), %xmm0 + movdqa %xmm0, %xmm2 + psubq %xmm0, %xmm1 + pcmpeqd %xmm4, %xmm2 + pcmpgtd %xmm4, %xmm0 + pand %xmm2, %xmm1 + por %xmm0, %xmm1 + pshufd $245, %xmm1, %xmm1 + movmskpd %xmm1, %r8d + andl $1, %r8d + jne .L357 +.L356: + cmpq %rdx, 
%rax + je .L351 + leaq (%rdi,%rax,8), %rdx + movq (%rdx), %rcx + movq %rcx, (%r10) + movq %r11, (%rdx) + cmpq %rax, %rsi + jbe .L363 + movq %rax, %rdx +.L358: + leaq (%rdx,%rdx), %rcx + leaq 1(%rdx), %r8 + leaq 1(%rcx), %rax + addq $2, %rcx + cmpq %rsi, %rax + jnb .L351 +.L354: + movddup (%rdi,%rax,8), %xmm2 + movdqa %xmm2, %xmm5 + movdqa %xmm3, %xmm0 + leaq (%rdi,%rdx,8), %r10 + pcmpeqd %xmm3, %xmm5 + psubq %xmm2, %xmm0 + movdqa %xmm6, %xmm4 + movdqa %xmm3, %xmm1 + pand %xmm5, %xmm0 + movdqa %xmm2, %xmm5 + pcmpgtd %xmm3, %xmm5 + por %xmm5, %xmm0 + pshufd $245, %xmm0, %xmm0 + movmskpd %xmm0, %r9d + andl $1, %r9d + je .L360 + movdqa %xmm2, %xmm1 + movdqa %xmm2, %xmm4 + jmp .L355 + .p2align 4,,10 + .p2align 3 +.L357: + cmpq %rcx, %rdx + je .L351 + leaq (%rdi,%rcx,8), %rax + movq (%rax), %rdx + movq %rdx, (%r10) + movq %rcx, %rdx + movq %r11, (%rax) + jmp .L358 + .p2align 4,,10 + .p2align 3 +.L351: + ret + .p2align 4,,10 + .p2align 3 +.L363: + ret + .cfi_endproc +.LFE18795: + .size _ZN3hwy7N_SSSE36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0, .-_ZN3hwy7N_SSSE36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0 + .section .text._ZN3hwy7N_SSSE36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy7N_SSSE36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0, @function +_ZN3hwy7N_SSSE36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0: +.LFB18796: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsi, %rax + salq $3, %rax + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + .cfi_offset 15, -24 + leaq (%rdi,%rax), %r15 + pushq %r14 + .cfi_offset 14, -32 + leaq (%r15,%rax), %r14 + pushq %r13 + .cfi_offset 13, -40 + leaq (%r14,%rax), %r13 + pushq %r12 + .cfi_offset 12, -48 + leaq 0(%r13,%rax), %r12 + pushq %rbx + .cfi_offset 3, -56 + leaq (%r12,%rax), %rbx + leaq (%rbx,%rax), %r11 + leaq (%r11,%rax), %r10 + subq $224, %rsp + leaq (%r10,%rax), %r9 + leaq (%r9,%rax), %r8 + movq %rdi, -232(%rbp) + movq %rsi, -192(%rbp) + movdqu (%rdi), %xmm9 + leaq (%r8,%rax), %rdi + movdqu (%r15), %xmm6 + leaq (%rdi,%rax), %rsi + movdqu 0(%r13), %xmm5 + movdqu (%r14), %xmm13 + leaq (%rsi,%rax), %rcx + movdqa %xmm9, %xmm14 + movdqu (%rbx), %xmm3 + movdqu (%r12), %xmm12 + leaq (%rcx,%rax), %rdx + psubq %xmm6, %xmm14 + movdqu (%r10), %xmm2 + movdqu (%r11), %xmm11 + movdqu (%rdx), %xmm0 + movdqu (%rsi), %xmm4 + movq %rdx, -216(%rbp) + addq %rax, %rdx + movdqu (%rdx), %xmm15 + movdqu (%r8), %xmm8 + movq %rdx, -224(%rbp) + addq %rdx, %rax + movaps %xmm0, -96(%rbp) + movdqa %xmm6, %xmm0 + movdqu (%rdi), %xmm7 + movdqu (%rcx), %xmm1 + pcmpeqd %xmm9, %xmm0 + movaps %xmm15, -112(%rbp) + movdqu (%r9), %xmm10 + pand %xmm14, %xmm0 + movdqa %xmm6, %xmm14 + pcmpgtd %xmm9, %xmm14 + por %xmm14, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm14 + pandn %xmm6, %xmm14 + pand %xmm0, %xmm6 + movdqa %xmm14, %xmm15 + movdqa %xmm9, %xmm14 + pand %xmm0, %xmm14 + por %xmm15, %xmm14 + movdqa %xmm0, %xmm15 + movdqa %xmm5, %xmm0 + pcmpeqd %xmm13, %xmm0 + pandn %xmm9, %xmm15 + movdqa %xmm13, %xmm9 + psubq %xmm5, %xmm9 + por %xmm15, %xmm6 + pand %xmm9, %xmm0 + movdqa %xmm5, %xmm9 + pcmpgtd %xmm13, %xmm9 + por %xmm9, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm9 + pandn %xmm5, %xmm9 + pand 
%xmm0, %xmm5 + movdqa %xmm9, %xmm15 + movdqa %xmm13, %xmm9 + pand %xmm0, %xmm9 + por %xmm15, %xmm9 + movdqa %xmm0, %xmm15 + movdqa %xmm3, %xmm0 + pcmpeqd %xmm12, %xmm0 + pandn %xmm13, %xmm15 + movdqa %xmm12, %xmm13 + psubq %xmm3, %xmm13 + por %xmm15, %xmm5 + pand %xmm13, %xmm0 + movdqa %xmm3, %xmm13 + pcmpgtd %xmm12, %xmm13 + por %xmm13, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm13 + pandn %xmm3, %xmm13 + pand %xmm0, %xmm3 + movdqa %xmm13, %xmm15 + movdqa %xmm12, %xmm13 + pand %xmm0, %xmm13 + por %xmm15, %xmm13 + movdqa %xmm0, %xmm15 + movdqa %xmm2, %xmm0 + pandn %xmm12, %xmm15 + pcmpeqd %xmm11, %xmm0 + por %xmm15, %xmm3 + movaps %xmm3, -64(%rbp) + movdqa %xmm11, %xmm3 + psubq %xmm2, %xmm3 + pand %xmm3, %xmm0 + movdqa %xmm2, %xmm3 + pcmpgtd %xmm11, %xmm3 + por %xmm3, %xmm0 + movdqa %xmm11, %xmm3 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm12 + pand %xmm0, %xmm3 + pandn %xmm2, %xmm12 + pand %xmm0, %xmm2 + por %xmm12, %xmm3 + movdqa %xmm0, %xmm12 + movdqa %xmm8, %xmm0 + pcmpeqd %xmm10, %xmm0 + pandn %xmm11, %xmm12 + movdqa %xmm10, %xmm11 + psubq %xmm8, %xmm11 + por %xmm12, %xmm2 + movdqa %xmm10, %xmm12 + pand %xmm11, %xmm0 + movdqa %xmm8, %xmm11 + pcmpgtd %xmm10, %xmm11 + por %xmm11, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm11 + pand %xmm0, %xmm12 + pandn %xmm8, %xmm11 + pand %xmm0, %xmm8 + por %xmm11, %xmm12 + movdqa %xmm0, %xmm11 + movdqa %xmm4, %xmm0 + pandn %xmm10, %xmm11 + pcmpeqd %xmm7, %xmm0 + por %xmm11, %xmm8 + movaps %xmm8, -80(%rbp) + movdqa %xmm7, %xmm8 + movdqa -96(%rbp), %xmm10 + movdqa -112(%rbp), %xmm15 + psubq %xmm4, %xmm8 + pand %xmm8, %xmm0 + movdqa %xmm4, %xmm8 + pcmpgtd %xmm7, %xmm8 + por %xmm8, %xmm0 + movdqa %xmm7, %xmm8 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm11 + pand %xmm0, %xmm8 + pandn %xmm4, %xmm11 + pand %xmm0, %xmm4 + por %xmm11, %xmm8 + movdqa %xmm0, %xmm11 + movdqa %xmm10, %xmm0 + pcmpeqd %xmm1, %xmm0 + pandn %xmm7, %xmm11 + movdqa %xmm1, %xmm7 + psubq %xmm10, %xmm7 + por %xmm11, %xmm4 + movdqa %xmm1, %xmm11 + pand %xmm7, %xmm0 + movdqa %xmm10, %xmm7 + pcmpgtd %xmm1, %xmm7 + por %xmm7, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm7 + pand %xmm0, %xmm11 + pandn %xmm10, %xmm7 + por %xmm7, %xmm11 + movdqa %xmm0, %xmm7 + pand %xmm10, %xmm0 + movdqu (%rax), %xmm10 + pandn %xmm1, %xmm7 + movdqu (%rax), %xmm1 + por %xmm7, %xmm0 + movdqa %xmm15, %xmm7 + psubq %xmm1, %xmm7 + movdqu (%rax), %xmm1 + movaps %xmm0, -96(%rbp) + movdqa %xmm15, %xmm0 + pcmpeqd %xmm15, %xmm1 + pand %xmm7, %xmm1 + movdqu (%rax), %xmm7 + pcmpgtd %xmm15, %xmm7 + por %xmm7, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm7 + pand %xmm1, %xmm0 + pandn %xmm10, %xmm7 + por %xmm7, %xmm0 + movdqa %xmm1, %xmm7 + pand %xmm10, %xmm1 + pandn %xmm15, %xmm7 + movdqa %xmm14, %xmm10 + movdqa %xmm14, %xmm15 + por %xmm7, %xmm1 + movdqa %xmm9, %xmm7 + psubq %xmm9, %xmm10 + pcmpeqd %xmm14, %xmm7 + pand %xmm10, %xmm7 + movdqa %xmm9, %xmm10 + pcmpgtd %xmm14, %xmm10 + por %xmm10, %xmm7 + pshufd $245, %xmm7, %xmm7 + movdqa %xmm7, %xmm10 + pand %xmm7, %xmm15 + pandn %xmm9, %xmm10 + pand %xmm7, %xmm9 + por %xmm15, %xmm10 + movdqa %xmm7, %xmm15 + movdqa %xmm5, %xmm7 + pcmpeqd %xmm6, %xmm7 + pandn %xmm14, %xmm15 + movdqa %xmm6, %xmm14 + por %xmm9, %xmm15 + movdqa %xmm6, %xmm9 + psubq %xmm5, %xmm9 + pand %xmm9, %xmm7 + movdqa %xmm5, %xmm9 + pcmpgtd %xmm6, %xmm9 + por %xmm9, %xmm7 + pshufd $245, %xmm7, %xmm7 + movdqa %xmm7, %xmm9 + pand %xmm7, %xmm14 + pandn %xmm5, %xmm9 + pand %xmm7, %xmm5 + por %xmm14, %xmm9 + movdqa %xmm7, %xmm14 + pandn %xmm6, %xmm14 + movdqa %xmm3, 
%xmm6 + por %xmm14, %xmm5 + pcmpeqd %xmm13, %xmm6 + movdqa -64(%rbp), %xmm14 + movaps %xmm5, -112(%rbp) + movdqa %xmm13, %xmm5 + psubq %xmm3, %xmm5 + pand %xmm5, %xmm6 + movdqa %xmm3, %xmm5 + pcmpgtd %xmm13, %xmm5 + por %xmm5, %xmm6 + movdqa %xmm13, %xmm5 + pshufd $245, %xmm6, %xmm6 + movdqa %xmm6, %xmm7 + pand %xmm6, %xmm5 + pandn %xmm3, %xmm7 + pand %xmm6, %xmm3 + por %xmm7, %xmm5 + movdqa %xmm6, %xmm7 + movdqa %xmm14, %xmm6 + pandn %xmm13, %xmm7 + pcmpeqd %xmm2, %xmm6 + por %xmm7, %xmm3 + movdqa %xmm14, %xmm7 + psubq %xmm2, %xmm7 + pand %xmm7, %xmm6 + movdqa %xmm2, %xmm7 + pcmpgtd %xmm14, %xmm7 + por %xmm7, %xmm6 + movdqa %xmm14, %xmm7 + pshufd $245, %xmm6, %xmm6 + movdqa %xmm6, %xmm13 + pand %xmm6, %xmm7 + pandn %xmm2, %xmm13 + pand %xmm6, %xmm2 + por %xmm13, %xmm7 + movdqa %xmm6, %xmm13 + movdqa %xmm8, %xmm6 + pandn %xmm14, %xmm13 + pcmpeqd %xmm12, %xmm6 + movdqa %xmm12, %xmm14 + por %xmm13, %xmm2 + movdqa %xmm12, %xmm13 + psubq %xmm8, %xmm13 + pand %xmm13, %xmm6 + movdqa %xmm8, %xmm13 + pcmpgtd %xmm12, %xmm13 + por %xmm13, %xmm6 + pshufd $245, %xmm6, %xmm6 + movdqa %xmm6, %xmm13 + pand %xmm6, %xmm14 + pandn %xmm8, %xmm13 + pand %xmm6, %xmm8 + por %xmm14, %xmm13 + movdqa %xmm6, %xmm14 + movdqa %xmm8, %xmm6 + pandn %xmm12, %xmm14 + por %xmm14, %xmm6 + movdqa -80(%rbp), %xmm14 + movaps %xmm6, -128(%rbp) + movdqa %xmm14, %xmm6 + movdqa %xmm14, %xmm8 + movdqa %xmm14, %xmm12 + pcmpeqd %xmm4, %xmm6 + psubq %xmm4, %xmm8 + pand %xmm8, %xmm6 + movdqa %xmm4, %xmm8 + pcmpgtd %xmm14, %xmm8 + por %xmm8, %xmm6 + pshufd $245, %xmm6, %xmm6 + movdqa %xmm6, %xmm8 + pand %xmm6, %xmm12 + pandn %xmm4, %xmm8 + pand %xmm6, %xmm4 + por %xmm12, %xmm8 + movdqa %xmm6, %xmm12 + movdqa %xmm0, %xmm6 + pandn %xmm14, %xmm12 + pcmpeqd %xmm11, %xmm6 + movdqa -96(%rbp), %xmm14 + por %xmm12, %xmm4 + movaps %xmm4, -80(%rbp) + movdqa %xmm11, %xmm4 + psubq %xmm0, %xmm4 + pand %xmm4, %xmm6 + movdqa %xmm0, %xmm4 + pcmpgtd %xmm11, %xmm4 + por %xmm4, %xmm6 + movdqa %xmm11, %xmm4 + pshufd $245, %xmm6, %xmm6 + movdqa %xmm6, %xmm12 + pand %xmm6, %xmm4 + pandn %xmm0, %xmm12 + pand %xmm6, %xmm0 + por %xmm12, %xmm4 + movdqa %xmm6, %xmm12 + movdqa %xmm14, %xmm6 + pandn %xmm11, %xmm12 + movdqa %xmm14, %xmm11 + psubq %xmm1, %xmm6 + pcmpeqd %xmm1, %xmm11 + por %xmm12, %xmm0 + pand %xmm6, %xmm11 + movdqa %xmm1, %xmm6 + pcmpgtd %xmm14, %xmm6 + por %xmm6, %xmm11 + movdqa %xmm14, %xmm6 + pshufd $245, %xmm11, %xmm11 + movdqa %xmm11, %xmm12 + pand %xmm11, %xmm6 + pandn %xmm1, %xmm12 + pand %xmm11, %xmm1 + por %xmm12, %xmm6 + movdqa %xmm11, %xmm12 + movdqa %xmm5, %xmm11 + pandn %xmm14, %xmm12 + pcmpeqd %xmm10, %xmm11 + movdqa %xmm10, %xmm14 + por %xmm12, %xmm1 + movdqa %xmm10, %xmm12 + psubq %xmm5, %xmm12 + pand %xmm12, %xmm11 + movdqa %xmm5, %xmm12 + pcmpgtd %xmm10, %xmm12 + por %xmm12, %xmm11 + pshufd $245, %xmm11, %xmm11 + movdqa %xmm11, %xmm12 + pand %xmm11, %xmm14 + pandn %xmm5, %xmm12 + pand %xmm11, %xmm5 + por %xmm14, %xmm12 + movdqa %xmm11, %xmm14 + movdqa %xmm9, %xmm11 + pandn %xmm10, %xmm14 + movdqa %xmm9, %xmm10 + por %xmm5, %xmm14 + movdqa %xmm7, %xmm5 + psubq %xmm7, %xmm10 + pcmpeqd %xmm9, %xmm5 + pand %xmm10, %xmm5 + movdqa %xmm7, %xmm10 + pcmpgtd %xmm9, %xmm10 + por %xmm10, %xmm5 + pshufd $245, %xmm5, %xmm5 + movdqa %xmm5, %xmm10 + pand %xmm5, %xmm11 + pandn %xmm7, %xmm10 + pand %xmm5, %xmm7 + por %xmm11, %xmm10 + movdqa %xmm5, %xmm11 + movdqa %xmm3, %xmm5 + pandn %xmm9, %xmm11 + pcmpeqd %xmm15, %xmm5 + movdqa %xmm15, %xmm9 + por %xmm11, %xmm7 + movaps %xmm7, -96(%rbp) + movdqa %xmm15, %xmm7 + psubq %xmm3, %xmm7 + pand %xmm7, 
%xmm5 + movdqa %xmm3, %xmm7 + pcmpgtd %xmm15, %xmm7 + por %xmm7, %xmm5 + pshufd $245, %xmm5, %xmm5 + movdqa %xmm5, %xmm7 + pand %xmm5, %xmm9 + pandn %xmm3, %xmm7 + pand %xmm5, %xmm3 + por %xmm9, %xmm7 + movdqa %xmm5, %xmm9 + pandn %xmm15, %xmm9 + movdqa -112(%rbp), %xmm15 + por %xmm9, %xmm3 + movaps %xmm3, -144(%rbp) + movdqa %xmm15, %xmm3 + movdqa %xmm15, %xmm5 + movdqa %xmm15, %xmm9 + pcmpeqd %xmm2, %xmm3 + psubq %xmm2, %xmm5 + pand %xmm5, %xmm3 + movdqa %xmm2, %xmm5 + pcmpgtd %xmm15, %xmm5 + por %xmm5, %xmm3 + pshufd $245, %xmm3, %xmm3 + movdqa %xmm3, %xmm5 + pand %xmm3, %xmm9 + pandn %xmm2, %xmm5 + pand %xmm3, %xmm2 + por %xmm9, %xmm5 + movdqa %xmm3, %xmm9 + movdqa %xmm13, %xmm3 + pandn %xmm15, %xmm9 + psubq %xmm4, %xmm3 + movdqa -128(%rbp), %xmm15 + por %xmm9, %xmm2 + movaps %xmm2, -64(%rbp) + movdqa %xmm4, %xmm2 + pcmpeqd %xmm13, %xmm2 + pand %xmm3, %xmm2 + movdqa %xmm4, %xmm3 + pcmpgtd %xmm13, %xmm3 + por %xmm3, %xmm2 + movdqa %xmm13, %xmm3 + pshufd $245, %xmm2, %xmm2 + movdqa %xmm2, %xmm9 + pand %xmm2, %xmm3 + pandn %xmm4, %xmm9 + pand %xmm2, %xmm4 + por %xmm9, %xmm3 + movdqa %xmm2, %xmm9 + movdqa %xmm6, %xmm2 + pandn %xmm13, %xmm9 + pcmpeqd %xmm8, %xmm2 + por %xmm9, %xmm4 + movdqa %xmm8, %xmm9 + psubq %xmm6, %xmm9 + pand %xmm9, %xmm2 + movdqa %xmm6, %xmm9 + pcmpgtd %xmm8, %xmm9 + por %xmm9, %xmm2 + movdqa %xmm8, %xmm9 + pshufd $245, %xmm2, %xmm2 + movdqa %xmm2, %xmm11 + pand %xmm2, %xmm9 + pandn %xmm6, %xmm11 + pand %xmm2, %xmm6 + por %xmm11, %xmm9 + movdqa %xmm2, %xmm11 + movdqa %xmm15, %xmm2 + pandn %xmm8, %xmm11 + movdqa %xmm15, %xmm8 + psubq %xmm0, %xmm2 + pcmpeqd %xmm0, %xmm8 + por %xmm11, %xmm6 + pand %xmm2, %xmm8 + movdqa %xmm0, %xmm2 + pcmpgtd %xmm15, %xmm2 + por %xmm2, %xmm8 + movdqa %xmm15, %xmm2 + pshufd $245, %xmm8, %xmm8 + movdqa %xmm8, %xmm11 + pand %xmm8, %xmm2 + pandn %xmm0, %xmm11 + pand %xmm8, %xmm0 + por %xmm11, %xmm2 + movdqa %xmm8, %xmm11 + pandn %xmm15, %xmm11 + movdqa -80(%rbp), %xmm15 + por %xmm11, %xmm0 + movdqa %xmm15, %xmm11 + movdqa %xmm15, %xmm8 + pcmpeqd %xmm1, %xmm11 + psubq %xmm1, %xmm8 + pand %xmm8, %xmm11 + movdqa %xmm1, %xmm8 + pcmpgtd %xmm15, %xmm8 + por %xmm8, %xmm11 + pshufd $245, %xmm11, %xmm11 + movdqa %xmm11, %xmm8 + pandn %xmm1, %xmm8 + pand %xmm11, %xmm1 + movdqa %xmm8, %xmm13 + movdqa %xmm15, %xmm8 + pand %xmm11, %xmm8 + por %xmm13, %xmm8 + movdqa %xmm11, %xmm13 + movdqa %xmm3, %xmm11 + pcmpeqd %xmm12, %xmm11 + pandn %xmm15, %xmm13 + movdqa %xmm12, %xmm15 + psubq %xmm3, %xmm15 + por %xmm13, %xmm1 + pand %xmm15, %xmm11 + movdqa %xmm3, %xmm15 + pcmpgtd %xmm12, %xmm15 + por %xmm15, %xmm11 + pshufd $245, %xmm11, %xmm11 + movdqa %xmm11, %xmm15 + pandn %xmm3, %xmm15 + pand %xmm11, %xmm3 + movdqa %xmm15, %xmm13 + movdqa %xmm12, %xmm15 + pand %xmm11, %xmm15 + por %xmm13, %xmm15 + movaps %xmm15, -112(%rbp) + movdqa %xmm11, %xmm15 + movdqa %xmm10, %xmm11 + pandn %xmm12, %xmm15 + psubq %xmm9, %xmm11 + movdqa %xmm10, %xmm12 + movdqa %xmm15, %xmm13 + movdqa %xmm3, %xmm15 + movdqa %xmm9, %xmm3 + pcmpeqd %xmm10, %xmm3 + por %xmm13, %xmm15 + movdqa -96(%rbp), %xmm13 + pand %xmm11, %xmm3 + movdqa %xmm9, %xmm11 + pcmpgtd %xmm10, %xmm11 + por %xmm11, %xmm3 + pshufd $245, %xmm3, %xmm3 + movdqa %xmm3, %xmm11 + pand %xmm3, %xmm12 + pandn %xmm9, %xmm11 + pand %xmm3, %xmm9 + por %xmm11, %xmm12 + movdqa %xmm3, %xmm11 + movdqa %xmm7, %xmm3 + pandn %xmm10, %xmm11 + movdqa %xmm2, %xmm10 + psubq %xmm2, %xmm3 + movaps %xmm12, -80(%rbp) + pcmpeqd %xmm7, %xmm10 + por %xmm11, %xmm9 + pand %xmm3, %xmm10 + movdqa %xmm2, %xmm3 + pcmpgtd %xmm7, %xmm3 + por %xmm3, %xmm10 + 
movdqa %xmm7, %xmm3 + pshufd $245, %xmm10, %xmm10 + movdqa %xmm10, %xmm11 + pand %xmm10, %xmm3 + pandn %xmm2, %xmm11 + pand %xmm10, %xmm2 + por %xmm11, %xmm3 + movdqa %xmm10, %xmm11 + movdqa %xmm8, %xmm10 + pcmpeqd %xmm5, %xmm10 + pandn %xmm7, %xmm11 + movdqa %xmm5, %xmm7 + psubq %xmm8, %xmm7 + por %xmm11, %xmm2 + pand %xmm7, %xmm10 + movdqa %xmm8, %xmm7 + pcmpgtd %xmm5, %xmm7 + por %xmm7, %xmm10 + movdqa %xmm5, %xmm7 + pshufd $245, %xmm10, %xmm10 + movdqa %xmm10, %xmm11 + pand %xmm10, %xmm7 + pandn %xmm8, %xmm11 + pand %xmm10, %xmm8 + por %xmm11, %xmm7 + movdqa %xmm10, %xmm11 + movdqa %xmm4, %xmm10 + pcmpeqd %xmm14, %xmm10 + pandn %xmm5, %xmm11 + movdqa %xmm14, %xmm5 + psubq %xmm4, %xmm5 + por %xmm11, %xmm8 + pand %xmm5, %xmm10 + movdqa %xmm4, %xmm5 + pcmpgtd %xmm14, %xmm5 + por %xmm5, %xmm10 + movdqa %xmm14, %xmm5 + pshufd $245, %xmm10, %xmm10 + movdqa %xmm10, %xmm11 + pand %xmm10, %xmm5 + pandn %xmm4, %xmm11 + pand %xmm10, %xmm4 + por %xmm11, %xmm5 + movdqa %xmm10, %xmm11 + movdqa %xmm13, %xmm10 + pandn %xmm14, %xmm11 + psubq %xmm6, %xmm10 + movdqa -144(%rbp), %xmm14 + por %xmm11, %xmm4 + movdqa %xmm13, %xmm11 + pcmpeqd %xmm6, %xmm11 + pand %xmm10, %xmm11 + movdqa %xmm6, %xmm10 + pcmpgtd %xmm13, %xmm10 + por %xmm10, %xmm11 + movdqa %xmm13, %xmm10 + pshufd $245, %xmm11, %xmm11 + movdqa %xmm11, %xmm12 + pand %xmm11, %xmm10 + pandn %xmm6, %xmm12 + pand %xmm11, %xmm6 + por %xmm12, %xmm10 + movdqa %xmm11, %xmm12 + movdqa %xmm14, %xmm11 + pandn %xmm13, %xmm12 + pcmpeqd %xmm0, %xmm11 + por %xmm12, %xmm6 + movdqa %xmm14, %xmm12 + psubq %xmm0, %xmm12 + pand %xmm12, %xmm11 + movdqa %xmm0, %xmm12 + pcmpgtd %xmm14, %xmm12 + por %xmm12, %xmm11 + movdqa %xmm14, %xmm12 + pshufd $245, %xmm11, %xmm11 + movdqa %xmm11, %xmm13 + pand %xmm11, %xmm12 + pandn %xmm0, %xmm13 + pand %xmm11, %xmm0 + por %xmm13, %xmm12 + movdqa %xmm11, %xmm13 + pandn %xmm14, %xmm13 + movdqa -64(%rbp), %xmm14 + por %xmm13, %xmm0 + movdqa %xmm14, %xmm11 + movdqa %xmm14, %xmm13 + movaps %xmm0, -96(%rbp) + pcmpeqd %xmm1, %xmm11 + psubq %xmm1, %xmm13 + pand %xmm13, %xmm11 + movdqa %xmm1, %xmm13 + pcmpgtd %xmm14, %xmm13 + por %xmm13, %xmm11 + pshufd $245, %xmm11, %xmm11 + movdqa %xmm11, %xmm13 + movdqa %xmm11, %xmm0 + pandn -64(%rbp), %xmm0 + pandn %xmm1, %xmm13 + pand %xmm11, %xmm1 + pand %xmm11, %xmm14 + por %xmm0, %xmm1 + movdqa %xmm10, %xmm11 + por %xmm14, %xmm13 + movaps %xmm1, -144(%rbp) + movdqa %xmm2, %xmm1 + psubq %xmm2, %xmm11 + movdqa %xmm7, %xmm0 + pcmpeqd %xmm10, %xmm1 + psubq %xmm4, %xmm0 + pand %xmm11, %xmm1 + movdqa %xmm2, %xmm11 + pcmpgtd %xmm10, %xmm11 + por %xmm11, %xmm1 + movdqa %xmm10, %xmm11 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm14 + pand %xmm1, %xmm11 + pandn %xmm2, %xmm14 + pand %xmm1, %xmm2 + por %xmm14, %xmm11 + movdqa %xmm1, %xmm14 + movdqa %xmm12, %xmm1 + pandn %xmm10, %xmm14 + movdqa %xmm9, %xmm10 + psubq %xmm9, %xmm1 + pcmpeqd %xmm12, %xmm10 + por %xmm14, %xmm2 + pand %xmm1, %xmm10 + movdqa %xmm9, %xmm1 + pcmpgtd %xmm12, %xmm1 + por %xmm1, %xmm10 + movdqa %xmm12, %xmm1 + pshufd $245, %xmm10, %xmm10 + movdqa %xmm10, %xmm14 + pand %xmm10, %xmm1 + pandn %xmm9, %xmm14 + pand %xmm10, %xmm9 + por %xmm14, %xmm1 + movdqa %xmm10, %xmm14 + movdqa %xmm4, %xmm10 + pcmpeqd %xmm7, %xmm10 + pandn %xmm12, %xmm14 + por %xmm14, %xmm9 + movdqa %xmm7, %xmm14 + pand %xmm0, %xmm10 + movdqa %xmm4, %xmm0 + pcmpgtd %xmm7, %xmm0 + por %xmm0, %xmm10 + pshufd $245, %xmm10, %xmm10 + movdqa %xmm10, %xmm0 + pand %xmm10, %xmm14 + pandn %xmm4, %xmm0 + pand %xmm10, %xmm4 + por %xmm0, %xmm14 + movdqa %xmm10, %xmm0 + pandn %xmm7, 
%xmm0 + movdqa %xmm13, %xmm7 + por %xmm0, %xmm4 + psubq %xmm8, %xmm7 + movaps %xmm4, -64(%rbp) + movdqa %xmm8, %xmm4 + pcmpeqd %xmm13, %xmm4 + pand %xmm7, %xmm4 + movdqa %xmm8, %xmm7 + pcmpgtd %xmm13, %xmm7 + por %xmm7, %xmm4 + movdqa %xmm13, %xmm7 + pshufd $245, %xmm4, %xmm4 + movdqa %xmm4, %xmm10 + pand %xmm4, %xmm7 + pandn %xmm8, %xmm10 + pand %xmm4, %xmm8 + por %xmm10, %xmm7 + movdqa %xmm4, %xmm10 + pandn %xmm13, %xmm10 + movdqa -96(%rbp), %xmm13 + por %xmm10, %xmm8 + movdqa %xmm6, %xmm10 + movdqa %xmm13, %xmm4 + psubq %xmm13, %xmm10 + pcmpeqd %xmm6, %xmm4 + pand %xmm10, %xmm4 + movdqa %xmm13, %xmm10 + pcmpgtd %xmm6, %xmm10 + por %xmm10, %xmm4 + movdqa %xmm6, %xmm10 + pshufd $245, %xmm4, %xmm4 + movdqa %xmm4, %xmm0 + pand %xmm4, %xmm10 + pandn %xmm13, %xmm0 + por %xmm0, %xmm10 + movdqa %xmm4, %xmm0 + pandn %xmm6, %xmm0 + movdqa %xmm15, %xmm6 + movdqa %xmm0, %xmm12 + pcmpeqd %xmm5, %xmm6 + movdqa %xmm13, %xmm0 + movdqa -80(%rbp), %xmm13 + pand %xmm4, %xmm0 + movdqa %xmm5, %xmm4 + psubq %xmm15, %xmm4 + por %xmm12, %xmm0 + pand %xmm4, %xmm6 + movdqa %xmm15, %xmm4 + pcmpgtd %xmm5, %xmm4 + por %xmm4, %xmm6 + movdqa %xmm5, %xmm4 + pshufd $245, %xmm6, %xmm6 + movdqa %xmm6, %xmm12 + pand %xmm6, %xmm4 + pandn %xmm15, %xmm12 + pand %xmm6, %xmm15 + por %xmm12, %xmm4 + movdqa %xmm6, %xmm12 + movdqa %xmm13, %xmm6 + pandn %xmm5, %xmm12 + movdqa %xmm13, %xmm5 + psubq %xmm3, %xmm6 + pcmpeqd %xmm3, %xmm5 + por %xmm12, %xmm15 + pand %xmm6, %xmm5 + movdqa %xmm3, %xmm6 + pcmpgtd %xmm13, %xmm6 + por %xmm6, %xmm5 + movdqa %xmm13, %xmm6 + pshufd $245, %xmm5, %xmm5 + movdqa %xmm5, %xmm12 + pand %xmm5, %xmm6 + pandn %xmm3, %xmm12 + pand %xmm5, %xmm3 + por %xmm12, %xmm6 + movdqa %xmm5, %xmm12 + movdqa %xmm4, %xmm5 + pandn %xmm13, %xmm12 + pcmpeqd %xmm6, %xmm5 + movdqa %xmm6, %xmm13 + por %xmm12, %xmm3 + movdqa %xmm6, %xmm12 + psubq %xmm4, %xmm12 + pand %xmm12, %xmm5 + movdqa %xmm4, %xmm12 + pcmpgtd %xmm6, %xmm12 + por %xmm12, %xmm5 + pshufd $245, %xmm5, %xmm5 + movdqa %xmm5, %xmm12 + pand %xmm5, %xmm13 + pandn %xmm4, %xmm12 + pand %xmm5, %xmm4 + por %xmm12, %xmm13 + movdqa %xmm5, %xmm12 + movdqa %xmm10, %xmm5 + pcmpeqd %xmm7, %xmm5 + pandn %xmm6, %xmm12 + movdqa %xmm7, %xmm6 + movaps %xmm13, -128(%rbp) + psubq %xmm10, %xmm6 + por %xmm12, %xmm4 + pand %xmm6, %xmm5 + movdqa %xmm10, %xmm6 + pcmpgtd %xmm7, %xmm6 + por %xmm6, %xmm5 + movdqa %xmm7, %xmm6 + pshufd $245, %xmm5, %xmm5 + movdqa %xmm5, %xmm12 + pand %xmm5, %xmm6 + pandn %xmm10, %xmm12 + pand %xmm5, %xmm10 + por %xmm12, %xmm6 + movdqa %xmm5, %xmm12 + movdqa %xmm15, %xmm5 + pcmpeqd %xmm3, %xmm5 + pandn %xmm7, %xmm12 + movdqa %xmm3, %xmm7 + psubq %xmm15, %xmm7 + por %xmm12, %xmm10 + pand %xmm7, %xmm5 + movdqa %xmm15, %xmm7 + pcmpgtd %xmm3, %xmm7 + por %xmm7, %xmm5 + movdqa %xmm3, %xmm7 + pshufd $245, %xmm5, %xmm5 + movdqa %xmm5, %xmm12 + pand %xmm5, %xmm7 + pandn %xmm15, %xmm12 + pand %xmm5, %xmm15 + por %xmm12, %xmm7 + movdqa %xmm5, %xmm12 + movdqa %xmm15, %xmm5 + pandn %xmm3, %xmm12 + movdqa %xmm8, %xmm3 + por %xmm12, %xmm5 + movdqa %xmm0, %xmm12 + psubq %xmm0, %xmm3 + pcmpeqd %xmm8, %xmm12 + pand %xmm3, %xmm12 + movdqa %xmm0, %xmm3 + pcmpgtd %xmm8, %xmm3 + por %xmm3, %xmm12 + movdqa %xmm8, %xmm3 + pshufd $245, %xmm12, %xmm12 + movdqa %xmm12, %xmm13 + pand %xmm12, %xmm3 + pandn %xmm0, %xmm13 + pand %xmm12, %xmm0 + por %xmm13, %xmm3 + movdqa %xmm12, %xmm13 + movdqa %xmm7, %xmm12 + pandn %xmm8, %xmm13 + movdqa %xmm7, %xmm8 + por %xmm13, %xmm0 + psubq %xmm4, %xmm8 + movaps %xmm0, -160(%rbp) + movdqa %xmm4, %xmm0 + pcmpeqd %xmm7, %xmm0 + pand %xmm8, %xmm0 + 
movdqa %xmm4, %xmm8 + pcmpgtd %xmm7, %xmm8 + por %xmm8, %xmm0 + pshufd $245, %xmm0, %xmm0 + pand %xmm0, %xmm12 + movdqa %xmm0, %xmm8 + pandn %xmm4, %xmm8 + movdqa %xmm12, %xmm15 + pand %xmm0, %xmm4 + por %xmm8, %xmm15 + movdqa %xmm0, %xmm8 + movdqa %xmm11, %xmm0 + pandn %xmm7, %xmm8 + movdqa %xmm1, %xmm7 + psubq %xmm1, %xmm0 + pcmpeqd %xmm11, %xmm7 + por %xmm8, %xmm4 + pand %xmm0, %xmm7 + movdqa %xmm1, %xmm0 + pcmpgtd %xmm11, %xmm0 + por %xmm0, %xmm7 + movdqa %xmm11, %xmm0 + pshufd $245, %xmm7, %xmm7 + movdqa %xmm7, %xmm8 + pand %xmm7, %xmm0 + pandn %xmm1, %xmm8 + pand %xmm7, %xmm1 + por %xmm8, %xmm0 + movdqa %xmm7, %xmm8 + movdqa %xmm9, %xmm7 + pandn %xmm11, %xmm8 + psubq %xmm2, %xmm7 + por %xmm8, %xmm1 + movdqa %xmm1, %xmm11 + movdqa %xmm2, %xmm1 + pcmpeqd %xmm9, %xmm1 + pand %xmm7, %xmm1 + movdqa %xmm2, %xmm7 + pcmpgtd %xmm9, %xmm7 + por %xmm7, %xmm1 + movdqa %xmm9, %xmm7 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm8 + pand %xmm1, %xmm7 + pandn %xmm2, %xmm8 + pand %xmm1, %xmm2 + por %xmm8, %xmm7 + movdqa %xmm1, %xmm8 + movdqa %xmm10, %xmm1 + pandn %xmm9, %xmm8 + pcmpeqd %xmm3, %xmm1 + por %xmm8, %xmm2 + movdqa %xmm3, %xmm8 + psubq %xmm10, %xmm8 + pand %xmm8, %xmm1 + movdqa %xmm10, %xmm8 + pcmpgtd %xmm3, %xmm8 + por %xmm8, %xmm1 + movdqa %xmm3, %xmm8 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm13 + pand %xmm1, %xmm8 + pandn %xmm10, %xmm13 + pand %xmm1, %xmm10 + por %xmm13, %xmm8 + movdqa %xmm1, %xmm13 + movdqa %xmm14, %xmm1 + pandn %xmm3, %xmm13 + movdqa %xmm5, %xmm3 + psubq %xmm5, %xmm1 + pcmpeqd %xmm14, %xmm3 + por %xmm13, %xmm10 + movdqa %xmm14, %xmm13 + movaps %xmm10, -176(%rbp) + movdqa -64(%rbp), %xmm9 + movdqa %xmm6, %xmm10 + cmpq $1, -192(%rbp) + pand %xmm1, %xmm3 + movdqa %xmm5, %xmm1 + pcmpgtd %xmm14, %xmm1 + por %xmm1, %xmm3 + pshufd $245, %xmm3, %xmm3 + movdqa %xmm3, %xmm1 + pand %xmm3, %xmm13 + pandn %xmm5, %xmm1 + pand %xmm3, %xmm5 + por %xmm1, %xmm13 + movdqa %xmm3, %xmm1 + movdqa %xmm6, %xmm3 + pandn %xmm14, %xmm1 + psubq %xmm9, %xmm3 + movdqa %xmm13, %xmm12 + por %xmm1, %xmm5 + movdqa %xmm9, %xmm1 + pcmpeqd %xmm6, %xmm1 + pand %xmm3, %xmm1 + movdqa %xmm9, %xmm3 + pcmpgtd %xmm6, %xmm3 + por %xmm3, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm1, %xmm10 + pandn %xmm9, %xmm3 + por %xmm3, %xmm10 + movdqa %xmm1, %xmm3 + pand %xmm9, %xmm1 + movdqa %xmm0, %xmm9 + pandn %xmm6, %xmm3 + pcmpeqd %xmm13, %xmm9 + por %xmm3, %xmm1 + movdqa %xmm13, %xmm3 + psubq %xmm0, %xmm3 + pand %xmm3, %xmm9 + movdqa %xmm0, %xmm3 + pcmpgtd %xmm13, %xmm3 + por %xmm3, %xmm9 + pshufd $245, %xmm9, %xmm9 + movdqa %xmm9, %xmm3 + pand %xmm9, %xmm12 + pandn %xmm0, %xmm3 + por %xmm3, %xmm12 + movdqa %xmm9, %xmm3 + pand %xmm0, %xmm9 + pandn %xmm13, %xmm3 + movdqa %xmm11, %xmm0 + por %xmm3, %xmm9 + movdqa %xmm5, %xmm3 + psubq %xmm5, %xmm0 + pcmpeqd %xmm11, %xmm3 + pand %xmm0, %xmm3 + movdqa %xmm5, %xmm0 + pcmpgtd %xmm11, %xmm0 + por %xmm0, %xmm3 + movdqa %xmm11, %xmm0 + pshufd $245, %xmm3, %xmm3 + movdqa %xmm3, %xmm6 + pand %xmm3, %xmm0 + pandn %xmm5, %xmm6 + por %xmm6, %xmm0 + movdqa %xmm3, %xmm6 + pand %xmm5, %xmm3 + movdqa %xmm7, %xmm5 + pandn %xmm11, %xmm6 + movdqa %xmm10, %xmm11 + pcmpeqd %xmm10, %xmm5 + por %xmm6, %xmm3 + movdqa %xmm10, %xmm6 + psubq %xmm7, %xmm6 + pand %xmm6, %xmm5 + movdqa %xmm7, %xmm6 + pcmpgtd %xmm10, %xmm6 + por %xmm6, %xmm5 + pshufd $245, %xmm5, %xmm5 + movdqa %xmm5, %xmm6 + pand %xmm5, %xmm11 + pandn %xmm7, %xmm6 + por %xmm6, %xmm11 + movdqa %xmm5, %xmm6 + pand %xmm7, %xmm5 + movdqa %xmm1, %xmm7 + pandn %xmm10, %xmm6 + pcmpeqd %xmm2, %xmm7 + por %xmm6, 
%xmm5 + movdqa %xmm2, %xmm6 + psubq %xmm1, %xmm6 + pand %xmm6, %xmm7 + movdqa %xmm1, %xmm6 + pcmpgtd %xmm2, %xmm6 + por %xmm6, %xmm7 + movdqa %xmm2, %xmm6 + pshufd $245, %xmm7, %xmm7 + movdqa %xmm7, %xmm10 + pand %xmm7, %xmm6 + pandn %xmm1, %xmm10 + pand %xmm7, %xmm1 + por %xmm10, %xmm6 + movdqa %xmm7, %xmm10 + movdqa %xmm12, %xmm7 + pandn %xmm2, %xmm10 + movdqa %xmm4, %xmm2 + psubq %xmm4, %xmm7 + pcmpeqd %xmm12, %xmm2 + por %xmm10, %xmm1 + movdqa %xmm12, %xmm10 + pand %xmm7, %xmm2 + movdqa %xmm4, %xmm7 + pcmpgtd %xmm12, %xmm7 + por %xmm7, %xmm2 + pshufd $245, %xmm2, %xmm2 + movdqa %xmm2, %xmm7 + pand %xmm2, %xmm10 + pandn %xmm4, %xmm7 + pand %xmm2, %xmm4 + por %xmm7, %xmm10 + movdqa %xmm2, %xmm7 + movdqa %xmm0, %xmm2 + pandn %xmm12, %xmm7 + pcmpeqd %xmm9, %xmm2 + por %xmm7, %xmm4 + movdqa %xmm9, %xmm7 + movaps %xmm4, -64(%rbp) + movdqa %xmm9, %xmm4 + psubq %xmm0, %xmm4 + pand %xmm4, %xmm2 + movdqa %xmm0, %xmm4 + pcmpgtd %xmm9, %xmm4 + por %xmm4, %xmm2 + pshufd $245, %xmm2, %xmm2 + movdqa %xmm2, %xmm4 + pand %xmm2, %xmm7 + pandn %xmm0, %xmm4 + pand %xmm2, %xmm0 + por %xmm4, %xmm7 + movdqa %xmm2, %xmm4 + movdqa %xmm3, %xmm2 + pandn %xmm9, %xmm4 + pcmpeqd %xmm11, %xmm2 + movaps %xmm7, -80(%rbp) + por %xmm4, %xmm0 + movdqa %xmm11, %xmm4 + psubq %xmm3, %xmm4 + pand %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pcmpgtd %xmm11, %xmm4 + por %xmm4, %xmm2 + movdqa %xmm11, %xmm4 + pshufd $245, %xmm2, %xmm2 + movdqa %xmm2, %xmm7 + pand %xmm2, %xmm4 + pandn %xmm3, %xmm7 + pand %xmm2, %xmm3 + por %xmm7, %xmm4 + movdqa %xmm2, %xmm7 + movdqa %xmm5, %xmm2 + pandn %xmm11, %xmm7 + psubq %xmm6, %xmm2 + por %xmm7, %xmm3 + movdqa %xmm6, %xmm7 + pcmpeqd %xmm5, %xmm7 + pand %xmm2, %xmm7 + movdqa %xmm6, %xmm2 + pcmpgtd %xmm5, %xmm2 + por %xmm2, %xmm7 + movdqa %xmm5, %xmm2 + pshufd $245, %xmm7, %xmm7 + movdqa %xmm7, %xmm9 + pand %xmm7, %xmm2 + pandn %xmm6, %xmm9 + pand %xmm7, %xmm6 + por %xmm9, %xmm2 + movdqa %xmm7, %xmm9 + movdqa %xmm8, %xmm7 + pandn %xmm5, %xmm9 + movdqa %xmm1, %xmm5 + psubq %xmm1, %xmm7 + pcmpeqd %xmm8, %xmm5 + por %xmm9, %xmm6 + pand %xmm7, %xmm5 + movdqa %xmm1, %xmm7 + pcmpgtd %xmm8, %xmm7 + por %xmm7, %xmm5 + movdqa %xmm8, %xmm7 + pshufd $245, %xmm5, %xmm5 + movdqa %xmm5, %xmm9 + pand %xmm5, %xmm7 + pandn %xmm1, %xmm9 + pand %xmm5, %xmm1 + por %xmm9, %xmm7 + movdqa %xmm5, %xmm9 + movdqa %xmm4, %xmm5 + pcmpeqd %xmm0, %xmm5 + pandn %xmm8, %xmm9 + movdqa %xmm0, %xmm8 + psubq %xmm4, %xmm8 + por %xmm9, %xmm1 + movdqa %xmm0, %xmm9 + pand %xmm8, %xmm5 + movdqa %xmm4, %xmm8 + pcmpgtd %xmm0, %xmm8 + por %xmm8, %xmm5 + pshufd $245, %xmm5, %xmm5 + movdqa %xmm5, %xmm8 + pand %xmm5, %xmm9 + pandn %xmm4, %xmm8 + pand %xmm5, %xmm4 + por %xmm8, %xmm9 + movdqa %xmm5, %xmm8 + pandn %xmm0, %xmm8 + movdqa %xmm3, %xmm0 + movaps %xmm9, -96(%rbp) + por %xmm8, %xmm4 + psubq %xmm2, %xmm0 + movdqa %xmm4, %xmm14 + movdqa %xmm2, %xmm4 + pcmpeqd %xmm3, %xmm4 + pand %xmm0, %xmm4 + movdqa %xmm2, %xmm0 + pcmpgtd %xmm3, %xmm0 + por %xmm0, %xmm4 + movdqa %xmm3, %xmm0 + pshufd $245, %xmm4, %xmm4 + movdqa %xmm4, %xmm5 + pand %xmm4, %xmm0 + pandn %xmm2, %xmm5 + pand %xmm4, %xmm2 + por %xmm5, %xmm0 + movdqa %xmm4, %xmm5 + pandn %xmm3, %xmm5 + por %xmm5, %xmm2 + jbe .L368 + movdqa -112(%rbp), %xmm12 + pshufd $78, %xmm7, %xmm7 + pshufd $78, %xmm6, %xmm6 + pshufd $78, -144(%rbp), %xmm9 + pshufd $78, %xmm1, %xmm11 + movdqa -128(%rbp), %xmm8 + pshufd $78, -160(%rbp), %xmm4 + pshufd $78, -176(%rbp), %xmm5 + movdqa %xmm12, %xmm3 + movdqa %xmm12, %xmm1 + movdqa %xmm12, %xmm13 + movaps %xmm5, -192(%rbp) + pcmpeqd %xmm9, %xmm3 + psubq %xmm9, %xmm1 
+ pshufd $78, %xmm2, %xmm2 + movaps %xmm11, -176(%rbp) + pshufd $78, %xmm0, %xmm0 + pand %xmm1, %xmm3 + movdqa %xmm9, %xmm1 + pcmpgtd %xmm12, %xmm1 + por %xmm1, %xmm3 + pshufd $245, %xmm3, %xmm3 + movdqa %xmm3, %xmm1 + pand %xmm3, %xmm13 + pandn %xmm9, %xmm1 + por %xmm1, %xmm13 + movdqa %xmm3, %xmm1 + pand %xmm9, %xmm3 + pandn %xmm12, %xmm1 + movdqa %xmm4, %xmm12 + movdqa %xmm13, %xmm9 + movaps %xmm1, -256(%rbp) + movdqa %xmm8, %xmm1 + psubq %xmm4, %xmm1 + pcmpeqd %xmm8, %xmm4 + movaps %xmm12, -208(%rbp) + pand %xmm1, %xmm4 + movdqa %xmm12, %xmm1 + movdqa %xmm8, %xmm12 + pcmpgtd %xmm8, %xmm1 + por %xmm1, %xmm4 + pshufd $245, %xmm4, %xmm4 + movdqa %xmm4, %xmm1 + pandn -208(%rbp), %xmm1 + pand %xmm4, %xmm12 + por %xmm1, %xmm12 + movdqa %xmm4, %xmm1 + pandn %xmm8, %xmm1 + movdqa %xmm15, %xmm8 + psubq %xmm5, %xmm8 + movaps %xmm1, -272(%rbp) + movdqa %xmm8, %xmm1 + movdqa %xmm5, %xmm8 + pcmpeqd %xmm15, %xmm8 + pand %xmm1, %xmm8 + movdqa %xmm5, %xmm1 + movdqa %xmm15, %xmm5 + pcmpgtd %xmm15, %xmm1 + por %xmm1, %xmm8 + pshufd $245, %xmm8, %xmm8 + movdqa %xmm8, %xmm1 + pandn -192(%rbp), %xmm1 + pand %xmm8, %xmm5 + por %xmm1, %xmm5 + movdqa %xmm8, %xmm1 + pand -192(%rbp), %xmm8 + pandn %xmm15, %xmm1 + movaps %xmm5, -128(%rbp) + movdqa %xmm10, %xmm5 + movaps %xmm1, -288(%rbp) + movdqa %xmm11, %xmm1 + psubq %xmm11, %xmm5 + pcmpeqd %xmm10, %xmm1 + pand %xmm5, %xmm1 + movdqa %xmm11, %xmm5 + movdqa %xmm10, %xmm11 + pcmpgtd %xmm10, %xmm5 + por %xmm5, %xmm1 + pshufd $245, %xmm1, %xmm1 + pand %xmm1, %xmm11 + movdqa %xmm1, %xmm5 + pandn -176(%rbp), %xmm5 + movdqa %xmm11, %xmm15 + movdqa %xmm1, %xmm11 + pand -176(%rbp), %xmm1 + por %xmm5, %xmm15 + pandn %xmm10, %xmm11 + movaps %xmm15, -144(%rbp) + movdqa -64(%rbp), %xmm15 + movaps %xmm11, -304(%rbp) + por -304(%rbp), %xmm1 + movdqa %xmm15, %xmm5 + movdqa %xmm15, %xmm10 + pcmpeqd %xmm7, %xmm5 + psubq %xmm7, %xmm10 + pshufd $78, %xmm1, %xmm1 + pand %xmm10, %xmm5 + movdqa %xmm7, %xmm10 + pcmpgtd %xmm15, %xmm10 + por %xmm10, %xmm5 + pshufd $245, %xmm5, %xmm5 + movdqa %xmm5, %xmm10 + pandn %xmm7, %xmm10 + pand %xmm5, %xmm7 + movaps %xmm10, -320(%rbp) + movdqa %xmm5, %xmm10 + pand -64(%rbp), %xmm5 + por -320(%rbp), %xmm5 + pandn %xmm15, %xmm10 + por %xmm10, %xmm7 + movdqa -80(%rbp), %xmm10 + pshufd $78, %xmm5, %xmm5 + movaps %xmm7, -112(%rbp) + movdqa %xmm10, %xmm11 + movdqa %xmm10, %xmm7 + pcmpeqd %xmm6, %xmm11 + psubq %xmm6, %xmm7 + pand %xmm7, %xmm11 + movdqa %xmm6, %xmm7 + pcmpgtd %xmm10, %xmm7 + por %xmm7, %xmm11 + pshufd $245, %xmm11, %xmm11 + movdqa %xmm11, %xmm7 + pandn %xmm6, %xmm7 + pand %xmm11, %xmm6 + movaps %xmm7, -336(%rbp) + movdqa %xmm11, %xmm7 + pand -80(%rbp), %xmm11 + por -336(%rbp), %xmm11 + pandn %xmm10, %xmm7 + movdqa -96(%rbp), %xmm10 + por %xmm7, %xmm6 + pshufd $78, %xmm11, %xmm11 + movdqa %xmm10, %xmm7 + movaps %xmm6, -160(%rbp) + movdqa %xmm10, %xmm6 + pcmpeqd %xmm2, %xmm7 + psubq %xmm2, %xmm6 + pand %xmm6, %xmm7 + movdqa %xmm2, %xmm6 + pcmpgtd %xmm10, %xmm6 + por %xmm6, %xmm7 + pshufd $245, %xmm7, %xmm7 + movdqa %xmm7, %xmm6 + pandn %xmm2, %xmm6 + pand %xmm7, %xmm2 + movaps %xmm6, -352(%rbp) + movdqa %xmm7, %xmm6 + por -288(%rbp), %xmm8 + pand -96(%rbp), %xmm7 + pandn %xmm10, %xmm6 + por -256(%rbp), %xmm3 + por -352(%rbp), %xmm7 + por %xmm6, %xmm2 + movdqa %xmm14, %xmm6 + pand -208(%rbp), %xmm4 + psubq %xmm0, %xmm6 + pshufd $78, %xmm7, %xmm7 + por -272(%rbp), %xmm4 + movdqa %xmm6, %xmm10 + movdqa %xmm0, %xmm6 + movaps %xmm7, -80(%rbp) + pshufd $78, %xmm8, %xmm8 + pcmpeqd %xmm14, %xmm6 + pshufd $78, %xmm4, %xmm4 + movaps %xmm4, -96(%rbp) + 
pand %xmm10, %xmm6 + movdqa %xmm0, %xmm10 + pcmpgtd %xmm14, %xmm10 + por %xmm10, %xmm6 + pshufd $245, %xmm6, %xmm6 + movdqa %xmm6, %xmm10 + movdqa %xmm6, %xmm15 + pandn %xmm0, %xmm10 + pand %xmm6, %xmm0 + pand %xmm14, %xmm6 + por %xmm10, %xmm6 + pandn %xmm14, %xmm15 + movdqa -80(%rbp), %xmm14 + pshufd $78, %xmm6, %xmm6 + por %xmm15, %xmm0 + pshufd $78, %xmm3, %xmm15 + movdqa %xmm6, %xmm10 + movdqa %xmm13, %xmm3 + movdqa %xmm15, %xmm7 + pcmpeqd %xmm13, %xmm10 + pcmpeqd %xmm0, %xmm7 + psubq %xmm6, %xmm3 + pand %xmm3, %xmm10 + movdqa %xmm6, %xmm3 + pcmpgtd %xmm13, %xmm3 + por %xmm3, %xmm10 + pshufd $245, %xmm10, %xmm10 + movdqa %xmm10, %xmm3 + pand %xmm10, %xmm9 + movdqa %xmm10, %xmm4 + pandn %xmm6, %xmm3 + pandn %xmm13, %xmm4 + movdqa %xmm12, %xmm13 + por %xmm3, %xmm9 + movdqa %xmm0, %xmm3 + pand %xmm6, %xmm10 + movaps %xmm4, -256(%rbp) + psubq %xmm15, %xmm3 + movdqa %xmm0, %xmm4 + pand %xmm3, %xmm7 + movdqa %xmm15, %xmm3 + pcmpgtd %xmm0, %xmm3 + por %xmm3, %xmm7 + pshufd $245, %xmm7, %xmm7 + movdqa %xmm7, %xmm3 + pand %xmm7, %xmm4 + pandn %xmm15, %xmm3 + por %xmm3, %xmm4 + movdqa %xmm7, %xmm3 + pand %xmm15, %xmm7 + pandn %xmm0, %xmm3 + movdqa %xmm12, %xmm0 + psubq %xmm14, %xmm0 + movaps %xmm3, -272(%rbp) + movdqa %xmm0, %xmm3 + movdqa %xmm14, %xmm0 + pcmpeqd %xmm12, %xmm0 + pand %xmm3, %xmm0 + movdqa %xmm14, %xmm3 + movdqa %xmm2, %xmm14 + pcmpgtd %xmm12, %xmm3 + por %xmm3, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + pandn -80(%rbp), %xmm3 + pand %xmm0, %xmm13 + movaps %xmm0, -288(%rbp) + por %xmm3, %xmm13 + movdqa %xmm0, %xmm3 + movdqa -112(%rbp), %xmm0 + movaps %xmm13, -176(%rbp) + movdqa -96(%rbp), %xmm13 + pandn %xmm12, %xmm3 + movdqa %xmm2, %xmm12 + movaps %xmm3, -304(%rbp) + movdqa %xmm13, %xmm3 + psubq %xmm13, %xmm12 + pcmpeqd %xmm2, %xmm3 + pand %xmm12, %xmm3 + movdqa %xmm13, %xmm12 + movdqa -128(%rbp), %xmm13 + pcmpgtd %xmm2, %xmm12 + por %xmm12, %xmm3 + pshufd $245, %xmm3, %xmm3 + movdqa %xmm3, %xmm12 + pandn -96(%rbp), %xmm12 + pand %xmm3, %xmm14 + por %xmm12, %xmm14 + movdqa %xmm3, %xmm12 + pandn %xmm2, %xmm12 + movdqa %xmm13, %xmm2 + movaps %xmm14, -192(%rbp) + pcmpeqd %xmm11, %xmm2 + movaps %xmm12, -320(%rbp) + movdqa %xmm13, %xmm12 + psubq %xmm11, %xmm12 + pand %xmm12, %xmm2 + movdqa %xmm11, %xmm12 + pcmpgtd %xmm13, %xmm12 + por %xmm12, %xmm2 + pshufd $245, %xmm2, %xmm2 + movdqa %xmm2, %xmm14 + movdqa %xmm2, %xmm12 + pandn %xmm11, %xmm14 + pand %xmm2, %xmm11 + pandn %xmm13, %xmm12 + movdqa %xmm11, %xmm13 + movaps %xmm14, -336(%rbp) + pand -128(%rbp), %xmm2 + por -336(%rbp), %xmm2 + por %xmm12, %xmm13 + movdqa -160(%rbp), %xmm12 + movaps %xmm13, -208(%rbp) + pshufd $78, %xmm2, %xmm2 + movdqa %xmm12, %xmm14 + movdqa %xmm12, %xmm11 + pcmpeqd %xmm8, %xmm14 + psubq %xmm8, %xmm11 + pand %xmm11, %xmm14 + movdqa %xmm8, %xmm11 + pcmpgtd %xmm12, %xmm11 + por %xmm11, %xmm14 + pshufd $245, %xmm14, %xmm14 + movdqa %xmm14, %xmm11 + pandn %xmm8, %xmm11 + pand %xmm14, %xmm8 + movaps %xmm11, -352(%rbp) + movdqa %xmm14, %xmm11 + pandn %xmm12, %xmm11 + movdqa -144(%rbp), %xmm12 + por %xmm11, %xmm8 + movaps %xmm8, -64(%rbp) + movdqa %xmm12, %xmm8 + movdqa %xmm12, %xmm11 + pcmpeqd %xmm5, %xmm8 + psubq %xmm5, %xmm11 + pand %xmm11, %xmm8 + movdqa %xmm5, %xmm11 + pcmpgtd %xmm12, %xmm11 + por %xmm11, %xmm8 + pshufd $245, %xmm8, %xmm8 + movdqa %xmm8, %xmm11 + pandn %xmm5, %xmm11 + pand %xmm8, %xmm5 + movdqa %xmm11, %xmm13 + movdqa %xmm8, %xmm11 + pand -144(%rbp), %xmm8 + pandn %xmm12, %xmm11 + movdqa %xmm0, %xmm12 + por %xmm11, %xmm5 + movdqa %xmm0, %xmm11 + psubq %xmm1, %xmm12 + pcmpeqd 
%xmm1, %xmm11 + por %xmm13, %xmm8 + pshufd $78, %xmm8, %xmm8 + pand %xmm12, %xmm11 + movdqa %xmm1, %xmm12 + pcmpgtd -112(%rbp), %xmm12 + por %xmm12, %xmm11 + pshufd $245, %xmm11, %xmm11 + movdqa %xmm11, %xmm12 + movdqa %xmm11, %xmm0 + pandn -112(%rbp), %xmm0 + pandn %xmm1, %xmm12 + pand %xmm11, %xmm1 + por %xmm0, %xmm1 + movdqa -288(%rbp), %xmm0 + movdqa -112(%rbp), %xmm6 + por -272(%rbp), %xmm7 + pand -96(%rbp), %xmm3 + pand %xmm11, %xmm6 + pand -80(%rbp), %xmm0 + por -256(%rbp), %xmm10 + pshufd $78, %xmm7, %xmm7 + por %xmm12, %xmm6 + movdqa %xmm9, %xmm12 + movdqa -208(%rbp), %xmm15 + movaps %xmm7, -96(%rbp) + movdqa %xmm8, %xmm7 + pshufd $78, %xmm6, %xmm6 + pshufd $78, %xmm10, %xmm10 + pcmpeqd %xmm9, %xmm7 + movaps %xmm6, -112(%rbp) + movdqa %xmm9, %xmm6 + por -304(%rbp), %xmm0 + psubq %xmm8, %xmm6 + movaps %xmm10, -80(%rbp) + por -320(%rbp), %xmm3 + pand -160(%rbp), %xmm14 + por -352(%rbp), %xmm14 + pshufd $78, %xmm0, %xmm0 + pand %xmm6, %xmm7 + movdqa %xmm8, %xmm6 + pshufd $78, %xmm3, %xmm3 + pcmpgtd %xmm9, %xmm6 + pshufd $78, %xmm14, %xmm14 + por %xmm6, %xmm7 + pshufd $245, %xmm7, %xmm7 + pand %xmm7, %xmm12 + movdqa %xmm7, %xmm6 + movdqa %xmm12, %xmm13 + movdqa %xmm7, %xmm12 + pandn %xmm8, %xmm6 + pandn %xmm9, %xmm12 + por %xmm6, %xmm13 + pand %xmm8, %xmm7 + movdqa -176(%rbp), %xmm9 + movaps %xmm12, -272(%rbp) + movdqa %xmm5, %xmm12 + movdqa %xmm9, %xmm11 + movdqa %xmm9, %xmm6 + psubq %xmm10, %xmm12 + pcmpeqd %xmm2, %xmm11 + psubq %xmm2, %xmm6 + pand %xmm6, %xmm11 + movdqa %xmm2, %xmm6 + pcmpgtd %xmm9, %xmm6 + por %xmm6, %xmm11 + pshufd $245, %xmm11, %xmm11 + movdqa %xmm11, %xmm6 + pandn %xmm2, %xmm6 + pand %xmm11, %xmm2 + movaps %xmm6, -288(%rbp) + movdqa %xmm11, %xmm6 + pandn %xmm9, %xmm6 + por %xmm6, %xmm2 + movdqa %xmm10, %xmm6 + pcmpeqd %xmm5, %xmm6 + pand %xmm12, %xmm6 + movdqa %xmm10, %xmm12 + pcmpgtd %xmm5, %xmm12 + por %xmm12, %xmm6 + pshufd $245, %xmm6, %xmm6 + movdqa %xmm6, %xmm10 + movdqa %xmm6, %xmm12 + pandn -80(%rbp), %xmm12 + pandn %xmm5, %xmm10 + movaps %xmm10, -304(%rbp) + movdqa %xmm15, %xmm10 + movdqa %xmm12, %xmm9 + movdqa %xmm5, %xmm12 + pcmpeqd %xmm0, %xmm10 + movdqa %xmm15, %xmm5 + pand %xmm6, %xmm12 + psubq %xmm0, %xmm5 + por %xmm9, %xmm12 + pand %xmm5, %xmm10 + movdqa %xmm0, %xmm5 + pcmpgtd %xmm15, %xmm5 + por %xmm5, %xmm10 + pshufd $245, %xmm10, %xmm10 + movdqa %xmm10, %xmm5 + pandn %xmm0, %xmm5 + pand %xmm10, %xmm0 + movaps %xmm5, -320(%rbp) + movdqa %xmm10, %xmm5 + pandn %xmm15, %xmm5 + movdqa %xmm4, %xmm15 + por %xmm5, %xmm0 + movdqa -112(%rbp), %xmm5 + movaps %xmm0, -144(%rbp) + psubq %xmm5, %xmm15 + movdqa %xmm15, %xmm9 + movdqa %xmm5, %xmm15 + pcmpeqd %xmm4, %xmm5 + pand %xmm9, %xmm5 + movdqa %xmm15, %xmm9 + movdqa %xmm4, %xmm15 + pcmpgtd %xmm4, %xmm9 + por %xmm9, %xmm5 + pshufd $245, %xmm5, %xmm5 + movdqa %xmm5, %xmm9 + pandn -112(%rbp), %xmm9 + pand %xmm5, %xmm15 + por %xmm9, %xmm15 + movdqa %xmm5, %xmm9 + movaps %xmm15, -160(%rbp) + pandn %xmm4, %xmm9 + movdqa -192(%rbp), %xmm15 + movaps %xmm9, -336(%rbp) + movdqa %xmm15, %xmm9 + movdqa %xmm15, %xmm4 + pcmpeqd %xmm14, %xmm9 + psubq %xmm14, %xmm4 + pand %xmm4, %xmm9 + movdqa %xmm14, %xmm4 + pcmpgtd %xmm15, %xmm4 + por %xmm4, %xmm9 + pshufd $245, %xmm9, %xmm9 + movdqa %xmm9, %xmm0 + movdqa %xmm9, %xmm4 + pandn %xmm15, %xmm0 + movdqa -96(%rbp), %xmm15 + pandn %xmm14, %xmm4 + pand %xmm9, %xmm14 + por %xmm0, %xmm14 + movaps %xmm4, -352(%rbp) + movdqa %xmm1, %xmm0 + movdqa %xmm15, %xmm4 + movaps %xmm14, -128(%rbp) + movdqa %xmm15, %xmm14 + psubq %xmm15, %xmm0 + pcmpeqd %xmm1, %xmm4 + pcmpgtd %xmm1, %xmm14 
+ pand %xmm0, %xmm4 + por %xmm14, %xmm4 + pshufd $245, %xmm4, %xmm4 + movdqa %xmm4, %xmm0 + pandn -96(%rbp), %xmm0 + movdqa %xmm0, %xmm14 + movdqa %xmm1, %xmm0 + pand %xmm4, %xmm0 + movdqa %xmm0, %xmm15 + movdqa %xmm4, %xmm0 + pandn %xmm1, %xmm0 + por %xmm14, %xmm15 + movaps %xmm0, -368(%rbp) + movdqa -64(%rbp), %xmm0 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm14 + pcmpeqd %xmm3, %xmm1 + psubq %xmm3, %xmm14 + pand %xmm14, %xmm1 + movdqa %xmm3, %xmm14 + pcmpgtd -64(%rbp), %xmm14 + pand -80(%rbp), %xmm6 + pand -112(%rbp), %xmm5 + pand -176(%rbp), %xmm11 + por -272(%rbp), %xmm7 + por -288(%rbp), %xmm11 + pand -96(%rbp), %xmm4 + por %xmm14, %xmm1 + pshufd $78, %xmm7, %xmm7 + movdqa -208(%rbp), %xmm8 + por -304(%rbp), %xmm6 + pshufd $245, %xmm1, %xmm1 + pshufd $78, %xmm11, %xmm11 + pand -192(%rbp), %xmm9 + movdqa %xmm1, %xmm14 + movdqa %xmm1, %xmm0 + pandn -64(%rbp), %xmm0 + pandn %xmm3, %xmm14 + pand %xmm1, %xmm3 + pand %xmm10, %xmm8 + por %xmm0, %xmm3 + movdqa %xmm13, %xmm10 + por -320(%rbp), %xmm8 + movaps %xmm14, -384(%rbp) + movaps %xmm3, -256(%rbp) + movdqa -64(%rbp), %xmm3 + psubq %xmm11, %xmm10 + pshufd $78, %xmm6, %xmm6 + pshufd $78, %xmm8, %xmm8 + por -336(%rbp), %xmm5 + por -352(%rbp), %xmm9 + pand %xmm1, %xmm3 + movdqa %xmm11, %xmm1 + por -384(%rbp), %xmm3 + pcmpeqd %xmm13, %xmm1 + pshufd $78, %xmm9, %xmm9 + pshufd $78, %xmm5, %xmm5 + pshufd $78, %xmm3, %xmm3 + por -368(%rbp), %xmm4 + pand %xmm10, %xmm1 + movdqa %xmm11, %xmm10 + pshufd $78, %xmm4, %xmm4 + pcmpgtd %xmm13, %xmm10 + por %xmm10, %xmm1 + movdqa %xmm13, %xmm10 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm0 + pand %xmm1, %xmm10 + pandn %xmm11, %xmm0 + por %xmm0, %xmm10 + movdqa %xmm1, %xmm0 + pand %xmm11, %xmm1 + movdqa %xmm7, %xmm11 + pandn %xmm13, %xmm0 + movdqa %xmm2, %xmm13 + pcmpeqd %xmm2, %xmm11 + psubq %xmm7, %xmm13 + por %xmm0, %xmm1 + movdqa %xmm2, %xmm0 + movaps %xmm1, -64(%rbp) + movdqa -144(%rbp), %xmm1 + pand %xmm13, %xmm11 + movdqa %xmm7, %xmm13 + pcmpgtd %xmm2, %xmm13 + por %xmm13, %xmm11 + pshufd $245, %xmm11, %xmm11 + movdqa %xmm11, %xmm13 + pand %xmm11, %xmm0 + pandn %xmm7, %xmm13 + por %xmm0, %xmm13 + movdqa %xmm11, %xmm0 + pand %xmm7, %xmm11 + pandn %xmm2, %xmm0 + movdqa %xmm8, %xmm2 + movdqa %xmm12, %xmm7 + pcmpeqd %xmm12, %xmm2 + psubq %xmm8, %xmm7 + movdqa %xmm0, %xmm14 + por %xmm11, %xmm14 + movdqa %xmm12, %xmm11 + movdqa %xmm1, %xmm0 + pand %xmm7, %xmm2 + movdqa %xmm8, %xmm7 + pcmpgtd %xmm12, %xmm7 + por %xmm7, %xmm2 + pshufd $245, %xmm2, %xmm2 + movdqa %xmm2, %xmm7 + pand %xmm2, %xmm11 + pandn %xmm8, %xmm7 + por %xmm7, %xmm11 + movdqa %xmm2, %xmm7 + pand %xmm8, %xmm2 + pandn %xmm12, %xmm7 + por %xmm7, %xmm2 + movdqa %xmm1, %xmm7 + movdqa %xmm2, %xmm12 + movdqa %xmm6, %xmm2 + psubq %xmm6, %xmm7 + pcmpeqd %xmm1, %xmm2 + pand %xmm7, %xmm2 + movdqa %xmm6, %xmm7 + pcmpgtd %xmm1, %xmm7 + por %xmm7, %xmm2 + pshufd $245, %xmm2, %xmm2 + pand %xmm2, %xmm0 + movdqa %xmm2, %xmm7 + pandn %xmm6, %xmm7 + movdqa %xmm0, %xmm8 + por %xmm7, %xmm8 + movdqa %xmm2, %xmm7 + pand %xmm6, %xmm2 + pandn %xmm1, %xmm7 + movdqa -160(%rbp), %xmm1 + por %xmm2, %xmm7 + movdqa %xmm1, %xmm0 + movdqa %xmm1, %xmm2 + movdqa %xmm1, %xmm6 + pcmpeqd %xmm9, %xmm0 + psubq %xmm9, %xmm2 + pand %xmm2, %xmm0 + movdqa %xmm9, %xmm2 + pcmpgtd %xmm1, %xmm2 + por %xmm2, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm2 + pand %xmm0, %xmm6 + pandn %xmm9, %xmm2 + por %xmm2, %xmm6 + movdqa %xmm0, %xmm2 + pand %xmm9, %xmm0 + pandn %xmm1, %xmm2 + movaps %xmm6, -144(%rbp) + movdqa -128(%rbp), %xmm6 + por %xmm2, %xmm0 + movdqa %xmm0, %xmm9 + 
movdqa %xmm6, %xmm0 + movdqa %xmm6, %xmm2 + pcmpeqd %xmm5, %xmm0 + psubq %xmm5, %xmm2 + pand %xmm2, %xmm0 + movdqa %xmm5, %xmm2 + pcmpgtd %xmm6, %xmm2 + por %xmm2, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm1 + pandn -128(%rbp), %xmm1 + movdqa %xmm0, %xmm2 + pand %xmm0, %xmm6 + pand %xmm5, %xmm0 + pandn %xmm5, %xmm2 + por %xmm1, %xmm0 + movdqa %xmm3, %xmm1 + por %xmm2, %xmm6 + pcmpeqd %xmm15, %xmm1 + movdqa %xmm15, %xmm2 + movdqa %xmm15, %xmm5 + movaps %xmm0, -160(%rbp) + psubq %xmm3, %xmm2 + movdqa %xmm1, %xmm0 + pand %xmm2, %xmm0 + movdqa %xmm3, %xmm2 + pcmpgtd %xmm15, %xmm2 + por %xmm2, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm2 + pand %xmm0, %xmm5 + pandn %xmm3, %xmm2 + por %xmm2, %xmm5 + movdqa %xmm0, %xmm2 + pand %xmm3, %xmm0 + pandn %xmm15, %xmm2 + movdqa -256(%rbp), %xmm15 + por %xmm2, %xmm0 + movdqa %xmm15, %xmm1 + movdqa %xmm15, %xmm2 + movdqa %xmm15, %xmm3 + movaps %xmm0, -176(%rbp) + pcmpeqd %xmm4, %xmm1 + psubq %xmm4, %xmm2 + movdqa %xmm1, %xmm0 + pand %xmm2, %xmm0 + movdqa %xmm4, %xmm2 + pcmpgtd %xmm15, %xmm2 + por %xmm2, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm2 + pand %xmm0, %xmm3 + pandn %xmm4, %xmm2 + por %xmm2, %xmm3 + movdqa %xmm0, %xmm2 + pand %xmm4, %xmm0 + pandn %xmm15, %xmm2 + movdqa %xmm0, %xmm4 + movaps %xmm3, -192(%rbp) + pshufd $78, %xmm10, %xmm3 + por %xmm2, %xmm4 + movdqa %xmm10, %xmm2 + movdqa %xmm3, %xmm1 + movdqa -64(%rbp), %xmm15 + psubq %xmm3, %xmm2 + pcmpgtd %xmm10, %xmm1 + movdqa %xmm2, %xmm0 + movdqa %xmm10, %xmm2 + pcmpeqd %xmm3, %xmm2 + pand %xmm0, %xmm2 + por %xmm1, %xmm2 + pshufd $245, %xmm2, %xmm0 + punpckhqdq %xmm0, %xmm0 + movdqa %xmm0, %xmm2 + pand %xmm3, %xmm0 + pshufd $78, %xmm15, %xmm3 + pandn %xmm10, %xmm2 + movdqa %xmm0, %xmm1 + por %xmm2, %xmm1 + movdqa %xmm15, %xmm2 + pcmpeqd %xmm3, %xmm2 + movaps %xmm1, -112(%rbp) + movdqa %xmm15, %xmm1 + psubq %xmm3, %xmm1 + pand %xmm1, %xmm2 + movdqa %xmm3, %xmm1 + pcmpgtd %xmm15, %xmm1 + por %xmm1, %xmm2 + pshufd $245, %xmm2, %xmm0 + punpckhqdq %xmm0, %xmm0 + movdqa %xmm0, %xmm2 + pand %xmm3, %xmm0 + pshufd $78, %xmm9, %xmm3 + pandn %xmm15, %xmm2 + movdqa %xmm0, %xmm1 + por %xmm2, %xmm1 + pshufd $78, %xmm13, %xmm2 + movaps %xmm1, -128(%rbp) + movdqa %xmm13, %xmm1 + movdqa %xmm2, %xmm15 + psubq %xmm2, %xmm1 + pcmpgtd %xmm13, %xmm15 + movdqa %xmm1, %xmm0 + movdqa %xmm13, %xmm1 + pcmpeqd %xmm2, %xmm1 + pand %xmm0, %xmm1 + por %xmm15, %xmm1 + pshufd $245, %xmm1, %xmm0 + punpckhqdq %xmm0, %xmm0 + movdqa %xmm0, %xmm1 + pand %xmm2, %xmm0 + pshufd $78, %xmm14, %xmm2 + pandn %xmm13, %xmm1 + movdqa %xmm0, %xmm15 + movdqa %xmm2, %xmm10 + por %xmm1, %xmm15 + pcmpgtd %xmm14, %xmm10 + movdqa %xmm14, %xmm1 + psubq %xmm2, %xmm1 + movdqa %xmm1, %xmm0 + movdqa %xmm14, %xmm1 + pcmpeqd %xmm2, %xmm1 + pand %xmm0, %xmm1 + por %xmm10, %xmm1 + pshufd $245, %xmm1, %xmm0 + punpckhqdq %xmm0, %xmm0 + movdqa %xmm0, %xmm1 + pand %xmm2, %xmm0 + pshufd $78, %xmm11, %xmm2 + pandn %xmm14, %xmm1 + movdqa %xmm0, %xmm10 + movdqa %xmm2, %xmm14 + por %xmm1, %xmm10 + pcmpgtd %xmm11, %xmm14 + movdqa %xmm11, %xmm1 + psubq %xmm2, %xmm1 + movdqa %xmm1, %xmm0 + movdqa %xmm11, %xmm1 + pcmpeqd %xmm2, %xmm1 + pand %xmm0, %xmm1 + por %xmm14, %xmm1 + pshufd $245, %xmm1, %xmm0 + punpckhqdq %xmm0, %xmm0 + movdqa %xmm0, %xmm1 + pand %xmm2, %xmm0 + pandn %xmm11, %xmm1 + movdqa %xmm0, %xmm2 + pshufd $78, %xmm4, %xmm11 + por %xmm1, %xmm2 + movdqa %xmm12, %xmm1 + movaps %xmm2, -64(%rbp) + pshufd $78, %xmm12, %xmm2 + movdqa -160(%rbp), %xmm13 + psubq %xmm2, %xmm1 + movdqa %xmm2, %xmm14 + movdqa %xmm1, %xmm0 + pcmpgtd 
%xmm12, %xmm14 + movdqa %xmm12, %xmm1 + pcmpeqd %xmm2, %xmm1 + pand %xmm0, %xmm1 + por %xmm14, %xmm1 + pshufd $245, %xmm1, %xmm0 + punpckhqdq %xmm0, %xmm0 + movdqa %xmm0, %xmm1 + pand %xmm2, %xmm0 + pandn %xmm12, %xmm1 + movdqa %xmm0, %xmm2 + por %xmm1, %xmm2 + movdqa %xmm8, %xmm1 + movaps %xmm2, -80(%rbp) + pshufd $78, %xmm8, %xmm2 + psubq %xmm2, %xmm1 + movdqa %xmm2, %xmm14 + movdqa %xmm1, %xmm0 + pcmpgtd %xmm8, %xmm14 + movdqa %xmm8, %xmm1 + pcmpeqd %xmm2, %xmm1 + pand %xmm0, %xmm1 + por %xmm14, %xmm1 + pshufd $245, %xmm1, %xmm0 + punpckhqdq %xmm0, %xmm0 + movdqa %xmm0, %xmm1 + pand %xmm2, %xmm0 + pandn %xmm8, %xmm1 + movdqa %xmm0, %xmm2 + por %xmm1, %xmm2 + movdqa %xmm7, %xmm1 + movaps %xmm2, -96(%rbp) + pshufd $78, %xmm7, %xmm2 + psubq %xmm2, %xmm1 + movdqa %xmm2, %xmm14 + movdqa %xmm1, %xmm0 + pcmpgtd %xmm7, %xmm14 + movdqa %xmm7, %xmm1 + pcmpeqd %xmm2, %xmm1 + pand %xmm0, %xmm1 + por %xmm14, %xmm1 + pshufd $245, %xmm1, %xmm0 + punpckhqdq %xmm0, %xmm0 + movdqa %xmm0, %xmm1 + pand %xmm2, %xmm0 + pandn %xmm7, %xmm1 + movdqa -144(%rbp), %xmm7 + movdqa %xmm0, %xmm14 + por %xmm1, %xmm14 + pshufd $78, %xmm7, %xmm2 + movdqa %xmm7, %xmm1 + movdqa %xmm7, %xmm0 + pcmpeqd %xmm2, %xmm1 + movdqa %xmm2, %xmm8 + psubq %xmm2, %xmm0 + pcmpgtd %xmm7, %xmm8 + pand %xmm0, %xmm1 + por %xmm8, %xmm1 + pshufd $78, %xmm5, %xmm8 + pshufd $245, %xmm1, %xmm0 + punpckhqdq %xmm0, %xmm0 + movdqa %xmm0, %xmm1 + pand %xmm2, %xmm0 + movdqa %xmm9, %xmm2 + pandn %xmm7, %xmm1 + psubq %xmm3, %xmm2 + pshufd $78, %xmm6, %xmm7 + por %xmm1, %xmm0 + movdqa %xmm9, %xmm1 + pcmpeqd %xmm3, %xmm1 + pand %xmm2, %xmm1 + movdqa %xmm3, %xmm2 + pcmpgtd %xmm9, %xmm2 + por %xmm2, %xmm1 + pshufd $245, %xmm1, %xmm2 + punpckhqdq %xmm2, %xmm2 + movdqa %xmm2, %xmm1 + pand %xmm3, %xmm2 + movdqa %xmm6, %xmm3 + pandn %xmm9, %xmm1 + pcmpeqd %xmm7, %xmm3 + por %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + psubq %xmm7, %xmm1 + pand %xmm1, %xmm3 + movdqa %xmm7, %xmm1 + pcmpgtd %xmm6, %xmm1 + por %xmm1, %xmm3 + pshufd $245, %xmm3, %xmm1 + punpckhqdq %xmm1, %xmm1 + movdqa %xmm1, %xmm3 + pandn %xmm6, %xmm3 + movdqa %xmm1, %xmm6 + movdqa %xmm13, %xmm1 + pand %xmm7, %xmm6 + movdqa %xmm13, %xmm7 + por %xmm3, %xmm6 + pshufd $78, %xmm13, %xmm3 + pcmpeqd %xmm3, %xmm1 + psubq %xmm3, %xmm7 + pand %xmm7, %xmm1 + movdqa %xmm3, %xmm7 + pcmpgtd %xmm13, %xmm7 + por %xmm7, %xmm1 + pshufd $245, %xmm1, %xmm7 + punpckhqdq %xmm7, %xmm7 + movdqa %xmm7, %xmm1 + pand %xmm3, %xmm7 + movdqa %xmm5, %xmm3 + pandn %xmm13, %xmm1 + pcmpeqd %xmm8, %xmm3 + movdqa -176(%rbp), %xmm13 + por %xmm1, %xmm7 + movdqa %xmm5, %xmm1 + psubq %xmm8, %xmm1 + pand %xmm1, %xmm3 + movdqa %xmm8, %xmm1 + pcmpgtd %xmm5, %xmm1 + por %xmm1, %xmm3 + pshufd $245, %xmm3, %xmm1 + punpckhqdq %xmm1, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm8, %xmm1 + movdqa %xmm13, %xmm8 + pandn %xmm5, %xmm3 + pshufd $78, %xmm13, %xmm5 + por %xmm3, %xmm1 + movdqa %xmm13, %xmm3 + movdqa %xmm5, %xmm9 + pcmpeqd %xmm5, %xmm3 + pcmpgtd %xmm13, %xmm9 + psubq %xmm5, %xmm8 + pand %xmm8, %xmm3 + por %xmm9, %xmm3 + pshufd $245, %xmm3, %xmm9 + punpckhqdq %xmm9, %xmm9 + movdqa %xmm9, %xmm3 + pand %xmm5, %xmm9 + pandn %xmm13, %xmm3 + movdqa -192(%rbp), %xmm13 + por %xmm3, %xmm9 + pshufd $78, %xmm13, %xmm5 + movdqa %xmm13, %xmm3 + movdqa %xmm13, %xmm8 + pcmpeqd %xmm5, %xmm3 + psubq %xmm5, %xmm8 + pand %xmm8, %xmm3 + movdqa %xmm5, %xmm8 + pcmpgtd %xmm13, %xmm8 + por %xmm8, %xmm3 + pshufd $245, %xmm3, %xmm8 + punpckhqdq %xmm8, %xmm8 + movdqa %xmm8, %xmm3 + pand %xmm5, %xmm8 + movdqa %xmm4, %xmm5 + pandn %xmm13, %xmm3 + pcmpeqd %xmm11, %xmm5 + por 
%xmm3, %xmm8 + movdqa %xmm4, %xmm3 + psubq %xmm11, %xmm3 + pand %xmm3, %xmm5 + movdqa %xmm11, %xmm3 + pcmpgtd %xmm4, %xmm3 + por %xmm3, %xmm5 + pshufd $245, %xmm5, %xmm3 + punpckhqdq %xmm3, %xmm3 + movdqa %xmm3, %xmm5 + pand %xmm11, %xmm3 + pandn %xmm4, %xmm5 + por %xmm5, %xmm3 +.L366: + movdqa -112(%rbp), %xmm4 + movq -232(%rbp), %rdx + movups %xmm4, (%rdx) + movdqa -128(%rbp), %xmm4 + movups %xmm4, (%r15) + movdqa -64(%rbp), %xmm4 + movups %xmm15, (%r14) + movups %xmm10, 0(%r13) + movups %xmm4, (%r12) + movdqa -80(%rbp), %xmm4 + movups %xmm4, (%rbx) + movdqa -96(%rbp), %xmm4 + movq -224(%rbp), %rbx + movups %xmm4, (%r11) + movups %xmm14, (%r10) + movups %xmm0, (%r9) + movups %xmm2, (%r8) + movups %xmm6, (%rdi) + movups %xmm7, (%rsi) + movups %xmm1, (%rcx) + movq -216(%rbp), %rcx + movups %xmm9, (%rcx) + movups %xmm8, (%rbx) + movups %xmm3, (%rax) + addq $224, %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L368: + .cfi_restore_state + movdqa -144(%rbp), %xmm3 + movdqa -160(%rbp), %xmm8 + movdqa -176(%rbp), %xmm9 + jmp .L366 + .cfi_endproc +.LFE18796: + .size _ZN3hwy7N_SSSE36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0, .-_ZN3hwy7N_SSSE36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + .section .text._ZN3hwy6N_SSE26detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_SSE26detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0, @function +_ZN3hwy6N_SSE26detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0: +.LFB18797: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + .cfi_offset 15, -24 + movq %rcx, %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbx + subq $2952, %rsp + .cfi_offset 14, -32 + .cfi_offset 13, -40 + .cfi_offset 12, -48 + .cfi_offset 3, -56 + movq %rdi, -2648(%rbp) + movq %rsi, -2808(%rbp) + movq %rdx, -2760(%rbp) + movq %r8, -2736(%rbp) + movq %r9, -2768(%rbp) + cmpq $32, %rdx + jbe .L594 + movq %rdi, %r12 + movq %rdi, %r14 + shrq $3, %r12 + movq %r12, %rax + andl $7, %eax + jne .L595 + movq %rdx, %r11 + movq %rdi, %r13 + movq %r8, %rax +.L381: + movq 8(%rax), %rdx + movq 16(%rax), %r9 + movq %rdx, %rcx + leaq 1(%r9), %rdi + movq %rdx, %rsi + xorq (%rax), %rdi + rolq $24, %rcx + shrq $11, %rsi + movq %rcx, %rax + leaq (%rdx,%rdx,8), %rcx + leaq 2(%r9), %rdx + addq %rdi, %rax + xorq %rsi, %rcx + movq %rax, %rsi + xorq %rdx, %rcx + leaq (%rax,%rax,8), %rdx + movq %rax, %r8 + rolq $24, %rsi + shrq $11, %r8 + leaq 3(%r9), %rax + addq %rcx, %rsi + xorq %r8, %rdx + movq %rsi, %r10 + movq %rsi, %r8 + xorq %rax, %rdx + shrq $11, %r8 + rolq $24, %r10 + leaq (%rsi,%rsi,8), %rax + leaq 4(%r9), %rsi + addq %rdx, %r10 + xorq %r8, %rax + addq $5, %r9 + xorq %rsi, %rax + movq %r10, %r8 + movq %r10, %rsi + shrq $11, %rsi + rolq $24, %r8 + addq %rax, %r8 + movq %rsi, %rbx + leaq (%r10,%r10,8), %rsi + xorq %rbx, %rsi + movq %r8, %rbx + leaq (%r8,%r8,8), %r10 + rolq $24, %r8 + xorq %r9, %rsi + shrq $11, %rbx + xorq %rbx, %r10 + addq %rsi, %r8 + movq -2736(%rbp), %rbx + movl %esi, %esi + movq %r10, %xmm0 + 
movq %r8, %xmm6 + movl %ecx, %r10d + movabsq $34359738359, %r8 + punpcklqdq %xmm6, %xmm0 + movq %r9, 16(%rbx) + movl %edx, %r9d + movups %xmm0, (%rbx) + movq %r11, %rbx + shrq $3, %rbx + cmpq %r8, %r11 + movl $4294967295, %r8d + movl %edi, %r11d + cmova %r8, %rbx + shrq $32, %rdi + movl %eax, %r8d + shrq $32, %rcx + shrq $32, %rdx + imulq %rbx, %r11 + shrq $32, %rax + imulq %rbx, %rdi + imulq %rbx, %r10 + imulq %rbx, %rcx + shrq $32, %r11 + imulq %rbx, %r9 + shrq $32, %rdi + salq $6, %r11 + imulq %rbx, %rdx + shrq $32, %r10 + salq $6, %rdi + addq %r13, %r11 + imulq %rbx, %r8 + shrq $32, %rcx + salq $6, %r10 + addq %r13, %rdi + shrq $32, %r9 + salq $6, %rcx + addq %r13, %r10 + shrq $32, %rdx + salq $6, %r9 + addq %r13, %rcx + shrq $32, %r8 + salq $6, %rdx + addq %r13, %r9 + salq $6, %r8 + addq %r13, %rdx + addq %r13, %r8 + imulq %rbx, %rax + imulq %rbx, %rsi + xorl %ebx, %ebx + shrq $32, %rax + shrq $32, %rsi + salq $6, %rax + salq $6, %rsi + addq %r13, %rax + addq %r13, %rsi +.L383: + movdqa (%r10,%rbx,8), %xmm2 + movdqa (%r11,%rbx,8), %xmm4 + movdqa (%rdi,%rbx,8), %xmm0 + movdqa %xmm2, %xmm3 + movdqa %xmm4, %xmm1 + pcmpeqd %xmm4, %xmm3 + psubq %xmm2, %xmm1 + pand %xmm3, %xmm1 + movdqa %xmm2, %xmm3 + pcmpgtd %xmm4, %xmm3 + por %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm5 + pand %xmm1, %xmm3 + pandn %xmm2, %xmm5 + pand %xmm1, %xmm2 + por %xmm5, %xmm3 + movdqa %xmm1, %xmm5 + pandn %xmm4, %xmm5 + movdqa %xmm0, %xmm4 + movdqa %xmm3, %xmm1 + pcmpeqd %xmm3, %xmm4 + psubq %xmm0, %xmm1 + por %xmm5, %xmm2 + pand %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pcmpgtd %xmm3, %xmm4 + por %xmm4, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm4 + pand %xmm1, %xmm0 + pandn %xmm3, %xmm4 + movdqa %xmm2, %xmm3 + por %xmm4, %xmm0 + movdqa (%rcx,%rbx,8), %xmm4 + pcmpeqd %xmm0, %xmm3 + movdqa %xmm0, %xmm1 + psubq %xmm2, %xmm1 + pand %xmm3, %xmm1 + movdqa %xmm2, %xmm3 + pcmpgtd %xmm0, %xmm3 + por %xmm3, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm1, %xmm0 + movdqa %xmm4, %xmm1 + pandn %xmm2, %xmm3 + movdqa (%rdx,%rbx,8), %xmm2 + por %xmm3, %xmm0 + movdqa %xmm2, %xmm3 + psubq %xmm2, %xmm1 + movaps %xmm0, (%r15,%rbx,8) + movdqa (%r9,%rbx,8), %xmm0 + pcmpeqd %xmm4, %xmm3 + pand %xmm3, %xmm1 + movdqa %xmm2, %xmm3 + pcmpgtd %xmm4, %xmm3 + por %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm5 + pand %xmm1, %xmm3 + pandn %xmm2, %xmm5 + pand %xmm1, %xmm2 + por %xmm5, %xmm3 + movdqa %xmm1, %xmm5 + pandn %xmm4, %xmm5 + movdqa %xmm0, %xmm4 + movdqa %xmm3, %xmm1 + pcmpeqd %xmm3, %xmm4 + psubq %xmm0, %xmm1 + por %xmm5, %xmm2 + pand %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pcmpgtd %xmm3, %xmm4 + por %xmm4, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm4 + pand %xmm1, %xmm0 + pandn %xmm3, %xmm4 + movdqa %xmm2, %xmm3 + por %xmm4, %xmm0 + movdqa (%r8,%rbx,8), %xmm4 + pcmpeqd %xmm0, %xmm3 + movdqa %xmm0, %xmm1 + psubq %xmm2, %xmm1 + pand %xmm3, %xmm1 + movdqa %xmm2, %xmm3 + pcmpgtd %xmm0, %xmm3 + por %xmm3, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm1, %xmm0 + movdqa %xmm4, %xmm1 + pandn %xmm2, %xmm3 + movdqa (%rsi,%rbx,8), %xmm2 + por %xmm3, %xmm0 + movdqa %xmm2, %xmm3 + psubq %xmm2, %xmm1 + movaps %xmm0, 64(%r15,%rbx,8) + movdqa (%rax,%rbx,8), %xmm0 + pcmpeqd %xmm4, %xmm3 + pand %xmm3, %xmm1 + movdqa %xmm2, %xmm3 + pcmpgtd %xmm4, %xmm3 + por %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm5 + pand %xmm1, %xmm3 + pandn %xmm2, %xmm5 + pand %xmm1, %xmm2 + por %xmm5, %xmm3 + 
movdqa %xmm1, %xmm5 + pandn %xmm4, %xmm5 + movdqa %xmm0, %xmm4 + movdqa %xmm3, %xmm1 + pcmpeqd %xmm3, %xmm4 + psubq %xmm0, %xmm1 + por %xmm5, %xmm2 + pand %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pcmpgtd %xmm3, %xmm4 + por %xmm4, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm4 + pand %xmm1, %xmm0 + pandn %xmm3, %xmm4 + movdqa %xmm2, %xmm3 + por %xmm4, %xmm0 + pcmpeqd %xmm0, %xmm3 + movdqa %xmm0, %xmm1 + psubq %xmm2, %xmm1 + pand %xmm3, %xmm1 + movdqa %xmm2, %xmm3 + pcmpgtd %xmm0, %xmm3 + por %xmm3, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm1, %xmm0 + pandn %xmm2, %xmm3 + por %xmm3, %xmm0 + movaps %xmm0, 128(%r15,%rbx,8) + addq $2, %rbx + cmpq $8, %rbx + jne .L383 + movq (%r15), %xmm2 + movdqa 16(%r15), %xmm0 + leaq 192(%r15), %r13 + movdqa (%r15), %xmm1 + punpcklqdq %xmm2, %xmm2 + pxor %xmm2, %xmm1 + pxor %xmm2, %xmm0 + por %xmm1, %xmm0 + movdqa 32(%r15), %xmm1 + pxor %xmm2, %xmm1 + por %xmm1, %xmm0 + movdqa 48(%r15), %xmm1 + pxor %xmm2, %xmm1 + por %xmm1, %xmm0 + movdqa 64(%r15), %xmm1 + pxor %xmm2, %xmm1 + por %xmm1, %xmm0 + movdqa 80(%r15), %xmm1 + pxor %xmm2, %xmm1 + por %xmm1, %xmm0 + movdqa 96(%r15), %xmm1 + pxor %xmm2, %xmm1 + por %xmm1, %xmm0 + movdqa 112(%r15), %xmm1 + pxor %xmm2, %xmm1 + por %xmm1, %xmm0 + movdqa 128(%r15), %xmm1 + pxor %xmm2, %xmm1 + por %xmm1, %xmm0 + movdqa 144(%r15), %xmm1 + pxor %xmm2, %xmm1 + por %xmm1, %xmm0 + movdqa 160(%r15), %xmm1 + pxor %xmm2, %xmm1 + por %xmm1, %xmm0 + movdqa 176(%r15), %xmm1 + pxor %xmm2, %xmm1 + por %xmm1, %xmm0 + pxor %xmm1, %xmm1 + pcmpeqd %xmm1, %xmm0 + pshufd $177, %xmm0, %xmm1 + pand %xmm1, %xmm0 + movmskpd %xmm0, %eax + cmpl $3, %eax + je .L384 + movdqa .LC4(%rip), %xmm0 + movl $2, %esi + movq %r15, %rdi + movups %xmm0, 192(%r15) + movups %xmm0, 208(%r15) + movups %xmm0, 224(%r15) + movups %xmm0, 240(%r15) + movups %xmm0, 256(%r15) + call _ZN3hwy6N_SSE26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + movq (%r15), %xmm2 + pcmpeqd %xmm0, %xmm0 + movq 184(%r15), %xmm1 + punpcklqdq %xmm1, %xmm1 + punpcklqdq %xmm2, %xmm2 + paddq %xmm1, %xmm0 + pcmpeqd %xmm2, %xmm0 + pshufd $177, %xmm0, %xmm3 + pand %xmm3, %xmm0 + movmskpd %xmm0, %eax + cmpl $3, %eax + jne .L386 + movq -2760(%rbp), %rsi + leaq -64(%rbp), %rdx + movq %r13, %rcx + movdqa %xmm2, %xmm0 + movq -2648(%rbp), %rdi + call _ZN3hwy6N_SSE26detail22MaybePartitionTwoValueINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, %al + jne .L369 +.L386: + movq 96(%r15), %rax + cmpq %rax, 88(%r15) + jne .L482 + cmpq 80(%r15), %rax + jne .L425 + cmpq 72(%r15), %rax + jne .L483 + cmpq 64(%r15), %rax + jne .L484 + cmpq 56(%r15), %rax + jne .L485 + cmpq 48(%r15), %rax + jne .L486 + cmpq 40(%r15), %rax + jne .L487 + cmpq 32(%r15), %rax + jne .L488 + cmpq 24(%r15), %rax + jne .L489 + cmpq 16(%r15), %rax + jne .L490 + cmpq 8(%r15), %rax + jne .L491 + xorl %ebx, %ebx + movl $1, %edx + cmpq %rax, (%r15) + jne .L428 +.L427: + movq %rax, %xmm2 + punpcklqdq %xmm2, %xmm2 +.L593: + movl $1, -2812(%rbp) +.L423: + cmpq $0, -2768(%rbp) + je .L596 + movq -2760(%rbp), %rax + movq -2648(%rbp), %rsi + movaps %xmm2, -2624(%rbp) + subq $2, %rax + movdqu (%rsi,%rax,8), %xmm6 + movq %rax, %rcx + movq %rax, -2672(%rbp) + andl $7, %ecx + movq %rcx, -2688(%rbp) + movaps %xmm6, -2800(%rbp) + andl $6, %eax + je .L492 + movdqu (%rsi), %xmm4 + movdqa %xmm2, %xmm1 + movaps %xmm2, -2848(%rbp) + movdqa %xmm4, %xmm0 + psubq %xmm4, %xmm1 + 
movaps %xmm4, -2640(%rbp) + pcmpeqd %xmm2, %xmm0 + pand %xmm0, %xmm1 + movdqa %xmm4, %xmm0 + pcmpgtd %xmm2, %xmm0 + por %xmm0, %xmm1 + pcmpeqd %xmm0, %xmm0 + pshufd $245, %xmm1, %xmm1 + pxor %xmm1, %xmm0 + movaps %xmm1, -2832(%rbp) + movmskpd %xmm0, %ebx + movq %rbx, %rdi + salq $4, %rbx + call __popcountdi2@PLT + leaq _ZZN3hwy6N_SSE26detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices(%rip), %rcx + movdqa -2640(%rbp), %xmm4 + movdqa (%rcx,%rbx), %xmm0 + movq %rcx, -2728(%rbp) + cltq + movq %rax, -2704(%rbp) + movaps %xmm0, -336(%rbp) + movzbl -328(%rbp), %edx + movd %xmm0, %r9d + movaps %xmm0, -352(%rbp) + andl $15, %r9d + movq %rdx, %rcx + movzbl -343(%rbp), %edx + movaps %xmm0, -320(%rbp) + movzbl -313(%rbp), %eax + movaps %xmm0, -368(%rbp) + andl $15, %ecx + movq %rdx, %rdi + movzbl -358(%rbp), %edx + movaps %xmm0, -304(%rbp) + movzbl -298(%rbp), %r14d + andl $15, %edi + andl $15, %eax + movaps %xmm0, -224(%rbp) + movzbl -223(%rbp), %r10d + andl $15, %edx + movaps %xmm0, -240(%rbp) + andl $15, %r14d + movzbl -238(%rbp), %r11d + movaps %xmm0, -256(%rbp) + movzbl -253(%rbp), %ebx + andl $15, %r10d + movaps %xmm0, -272(%rbp) + movzbl -268(%rbp), %r12d + andl $15, %r11d + movaps %xmm0, -288(%rbp) + movzbl -283(%rbp), %r13d + andl $15, %ebx + movaps %xmm0, -384(%rbp) + andl $15, %r12d + movq %rcx, -2640(%rbp) + andl $15, %r13d + movq %rdi, -2720(%rbp) + movq %rdx, -2656(%rbp) + movzbl -373(%rbp), %edx + movaps %xmm4, -208(%rbp) + movaps %xmm0, -400(%rbp) + movzbl -388(%rbp), %ecx + andl $15, %edx + movdqa .LC1(%rip), %xmm6 + movq -2704(%rbp), %xmm3 + movaps %xmm0, -416(%rbp) + movzbl -403(%rbp), %esi + movaps %xmm0, -432(%rbp) + andl $15, %ecx + movdqa %xmm6, %xmm5 + movzbl -418(%rbp), %edi + movaps %xmm0, -448(%rbp) + movzbl -208(%rbp,%rax), %eax + andl $15, %esi + punpcklqdq %xmm3, %xmm3 + movzbl -208(%rbp,%r14), %r14d + andl $15, %edi + movzbl -208(%rbp,%r13), %r13d + movaps %xmm6, -2752(%rbp) + salq $8, %rax + movzbl -208(%rbp,%rbx), %ebx + movzbl -208(%rbp,%r12), %r12d + pcmpeqd %xmm3, %xmm5 + orq %r14, %rax + movzbl -433(%rbp), %r8d + movzbl -208(%rbp,%r11), %r11d + salq $8, %rax + movzbl -208(%rbp,%rdi), %edi + movzbl -208(%rbp,%r10), %r10d + orq %r13, %rax + andl $15, %r8d + movzbl -208(%rbp,%rsi), %esi + movzbl -208(%rbp,%rcx), %ecx + salq $8, %rax + movzbl -208(%rbp,%r8), %r8d + movzbl -208(%rbp,%rdx), %edx + movzbl -208(%rbp,%r9), %r9d + orq %r12, %rax + salq $8, %rax + salq $8, %r8 + orq %rbx, %rax + salq $8, %rax + orq %r11, %rax + salq $8, %rax + orq %r10, %rax + salq $8, %rax + orq %rdi, %r8 + movq -2720(%rbp), %rdi + salq $8, %r8 + orq %r9, %rax + orq %rsi, %r8 + salq $8, %r8 + orq %rcx, %r8 + movq -2640(%rbp), %rcx + movq %rax, -2640(%rbp) + salq $8, %r8 + orq %rdx, %r8 + movq -2656(%rbp), %rdx + salq $8, %r8 + movzbl -208(%rbp,%rdx), %edx + orq %rdx, %r8 + movzbl -208(%rbp,%rdi), %edx + salq $8, %r8 + orq %rdx, %r8 + movzbl -208(%rbp,%rcx), %edx + salq $8, %r8 + orq %rdx, %r8 + movq %r8, -2632(%rbp) + movdqa .LC0(%rip), %xmm7 + movdqa -2832(%rbp), %xmm1 + movdqa -2848(%rbp), %xmm2 + movdqa %xmm7, %xmm0 + movaps %xmm7, -2784(%rbp) + psubq %xmm3, %xmm0 + pcmpgtd %xmm6, %xmm3 + pand %xmm5, %xmm0 + por %xmm3, %xmm0 + pshufd $245, %xmm0, %xmm0 + movq %xmm0, %rax + testq %rax, %rax + je .L433 + movq -2648(%rbp), %rsi + movdqa -2640(%rbp), %xmm6 + movq %xmm6, (%rsi) +.L433: + movhlps %xmm0, %xmm6 + movq %xmm6, %rax + testq %rax, %rax + je .L434 + movq -2648(%rbp), %rax + movdqa -2640(%rbp), %xmm6 + movhps %xmm6, 8(%rax) +.L434: + movq 
-2648(%rbp), %rax + movq -2704(%rbp), %rbx + movmskpd %xmm1, %r12d + movaps %xmm4, -2720(%rbp) + movq %r12, %rdi + movaps %xmm2, -2848(%rbp) + salq $4, %r12 + leaq (%rax,%rbx,8), %rbx + call __popcountdi2@PLT + movdqa -2720(%rbp), %xmm4 + movslq %eax, %rcx + movq %rcx, -2640(%rbp) + movq -2728(%rbp), %rcx + movaps %xmm4, -192(%rbp) + movdqa (%rcx,%r12), %xmm0 + movaps %xmm0, -576(%rbp) + movzbl -568(%rbp), %edx + movd %xmm0, %r9d + movaps %xmm0, -464(%rbp) + movzbl -463(%rbp), %eax + andl $15, %r9d + movq %rdx, %rcx + movaps %xmm0, -592(%rbp) + movzbl -583(%rbp), %edx + andl $15, %eax + movaps %xmm0, -608(%rbp) + andl $15, %ecx + movq %rdx, %rsi + movzbl -598(%rbp), %edx + movq %rax, -2704(%rbp) + movaps %xmm0, -560(%rbp) + movzbl -553(%rbp), %eax + andl $15, %esi + movaps %xmm0, -544(%rbp) + movq %rdx, %rdi + movzbl -538(%rbp), %r14d + andl $15, %edi + andl $15, %eax + movq %rcx, -2720(%rbp) + movq %rsi, -2656(%rbp) + andl $15, %r14d + movq %rdi, -2832(%rbp) + movaps %xmm0, -480(%rbp) + movzbl -478(%rbp), %r10d + movaps %xmm0, -496(%rbp) + movzbl -493(%rbp), %r11d + movaps %xmm0, -512(%rbp) + movzbl -508(%rbp), %r12d + andl $15, %r10d + movaps %xmm0, -528(%rbp) + movzbl -523(%rbp), %r13d + andl $15, %r11d + movaps %xmm0, -624(%rbp) + andl $15, %r12d + movzbl -613(%rbp), %edx + movaps %xmm0, -640(%rbp) + andl $15, %r13d + movzbl -628(%rbp), %ecx + movaps %xmm0, -656(%rbp) + movzbl -643(%rbp), %esi + andl $15, %edx + movaps %xmm0, -672(%rbp) + movzbl -658(%rbp), %edi + andl $15, %ecx + movaps %xmm0, -688(%rbp) + movzbl -192(%rbp,%rax), %eax + movzbl -192(%rbp,%r14), %r14d + andl $15, %esi + movzbl -192(%rbp,%r13), %r13d + movzbl -673(%rbp), %r8d + andl $15, %edi + salq $8, %rax + movzbl -192(%rbp,%r12), %r12d + movzbl -192(%rbp,%r11), %r11d + orq %r14, %rax + andl $15, %r8d + movzbl -192(%rbp,%r10), %r10d + movzbl -192(%rbp,%rdi), %edi + salq $8, %rax + movzbl -192(%rbp,%rsi), %esi + movzbl -192(%rbp,%r8), %r8d + orq %r13, %rax + movzbl -192(%rbp,%rcx), %ecx + movzbl -192(%rbp,%rdx), %edx + salq $8, %rax + salq $8, %r8 + movzbl -192(%rbp,%r9), %r9d + orq %r12, %rax + salq $8, %rax + orq %r11, %rax + movq -2704(%rbp), %r11 + salq $8, %rax + orq %r10, %rax + movzbl -192(%rbp,%r11), %r10d + salq $8, %rax + orq %r10, %rax + salq $8, %rax + orq %rdi, %r8 + movq -2832(%rbp), %rdi + salq $8, %r8 + orq %r9, %rax + orq %rsi, %r8 + movq -2656(%rbp), %rsi + movq %rax, -2704(%rbp) + salq $8, %r8 + orq %rcx, %r8 + movq -2720(%rbp), %rcx + salq $8, %r8 + orq %rdx, %r8 + movzbl -192(%rbp,%rdi), %edx + salq $8, %r8 + orq %rdx, %r8 + movzbl -192(%rbp,%rsi), %edx + salq $8, %r8 + orq %rdx, %r8 + movzbl -192(%rbp,%rcx), %edx + salq $8, %r8 + orq %rdx, %r8 + movq %r8, -2696(%rbp) + movdqa -2704(%rbp), %xmm6 + testb $4, -2672(%rbp) + movdqa -2848(%rbp), %xmm2 + movups %xmm6, (%r15) + je .L435 + movq -2648(%rbp), %rax + movdqa -2624(%rbp), %xmm6 + movdqa %xmm2, %xmm1 + movaps %xmm2, -2880(%rbp) + movdqu 16(%rax), %xmm3 + movdqa %xmm6, %xmm0 + pcmpeqd %xmm3, %xmm0 + psubq %xmm3, %xmm1 + movaps %xmm3, -2704(%rbp) + pand %xmm0, %xmm1 + movdqa %xmm3, %xmm0 + pcmpgtd %xmm6, %xmm0 + por %xmm0, %xmm1 + pcmpeqd %xmm0, %xmm0 + pshufd $245, %xmm1, %xmm1 + pxor %xmm1, %xmm0 + movaps %xmm1, -2864(%rbp) + movmskpd %xmm0, %r12d + movq %r12, %rdi + salq $4, %r12 + call __popcountdi2@PLT + movq -2728(%rbp), %rcx + movdqa -2704(%rbp), %xmm3 + cltq + movdqa (%rcx,%r12), %xmm0 + movq %rax, -2720(%rbp) + movaps %xmm3, -176(%rbp) + movaps %xmm0, -816(%rbp) + movzbl -808(%rbp), %edx + movd %xmm0, %r9d + movaps %xmm0, -704(%rbp) + 
movzbl -703(%rbp), %eax + andl $15, %r9d + movaps %xmm0, -832(%rbp) + movq %rdx, %rdi + movzbl -823(%rbp), %edx + movaps %xmm0, -800(%rbp) + movq %rax, %rcx + andl $15, %edi + movzbl -793(%rbp), %eax + movaps %xmm0, -848(%rbp) + movq %rdx, %rsi + movzbl -838(%rbp), %edx + andl $15, %ecx + movaps %xmm0, -784(%rbp) + andl $15, %esi + andl $15, %eax + movzbl -778(%rbp), %r14d + andl $15, %edx + movaps %xmm0, -720(%rbp) + movzbl -718(%rbp), %r10d + movaps %xmm0, -736(%rbp) + andl $15, %r14d + movzbl -733(%rbp), %r11d + movaps %xmm0, -752(%rbp) + movzbl -748(%rbp), %r12d + andl $15, %r10d + movaps %xmm0, -768(%rbp) + movzbl -763(%rbp), %r13d + andl $15, %r11d + movq %rcx, -2704(%rbp) + andl $15, %r12d + movq %rdi, -2656(%rbp) + andl $15, %r13d + movq %rsi, -2832(%rbp) + movq %rdx, -2848(%rbp) + movaps %xmm0, -864(%rbp) + movzbl -853(%rbp), %edx + movaps %xmm0, -880(%rbp) + movzbl -868(%rbp), %ecx + movaps %xmm0, -896(%rbp) + movzbl -883(%rbp), %esi + andl $15, %edx + movaps %xmm0, -912(%rbp) + movzbl -898(%rbp), %edi + andl $15, %ecx + movaps %xmm0, -928(%rbp) + movzbl -176(%rbp,%rax), %eax + movzbl -176(%rbp,%r14), %r14d + andl $15, %esi + movzbl -176(%rbp,%r13), %r13d + movzbl -913(%rbp), %r8d + andl $15, %edi + salq $8, %rax + movzbl -176(%rbp,%r12), %r12d + movzbl -176(%rbp,%r11), %r11d + orq %r14, %rax + andl $15, %r8d + movzbl -176(%rbp,%r10), %r10d + movzbl -176(%rbp,%rdi), %edi + salq $8, %rax + movzbl -176(%rbp,%rsi), %esi + movzbl -176(%rbp,%r8), %r8d + orq %r13, %rax + movzbl -176(%rbp,%rcx), %ecx + movzbl -176(%rbp,%rdx), %edx + salq $8, %rax + salq $8, %r8 + movzbl -176(%rbp,%r9), %r9d + orq %r12, %rax + salq $8, %rax + orq %r11, %rax + movq -2704(%rbp), %r11 + salq $8, %rax + orq %r10, %rax + movzbl -176(%rbp,%r11), %r10d + salq $8, %rax + orq %r10, %rax + salq $8, %rax + orq %rdi, %r8 + movq -2656(%rbp), %rdi + salq $8, %r8 + orq %r9, %rax + orq %rsi, %r8 + movq -2832(%rbp), %rsi + movq %rax, -2704(%rbp) + salq $8, %r8 + orq %rcx, %r8 + salq $8, %r8 + orq %rdx, %r8 + movq -2848(%rbp), %rdx + salq $8, %r8 + movzbl -176(%rbp,%rdx), %edx + orq %rdx, %r8 + movzbl -176(%rbp,%rsi), %edx + salq $8, %r8 + orq %rdx, %r8 + movzbl -176(%rbp,%rdi), %edx + salq $8, %r8 + orq %rdx, %r8 + movq %r8, -2696(%rbp) + movdqa -2752(%rbp), %xmm6 + movdqa -2784(%rbp), %xmm0 + movdqa -2864(%rbp), %xmm1 + movq -2720(%rbp), %xmm4 + movdqa %xmm6, %xmm5 + movdqa -2880(%rbp), %xmm2 + punpcklqdq %xmm4, %xmm4 + pcmpeqd %xmm4, %xmm5 + psubq %xmm4, %xmm0 + pcmpgtd %xmm6, %xmm4 + pand %xmm5, %xmm0 + por %xmm4, %xmm0 + pshufd $245, %xmm0, %xmm0 + movq %xmm0, %rax + testq %rax, %rax + je .L436 + movdqa -2704(%rbp), %xmm6 + movq %xmm6, (%rbx) +.L436: + movhlps %xmm0, %xmm6 + movq %xmm6, %rax + testq %rax, %rax + je .L437 + movdqa -2704(%rbp), %xmm6 + movhps %xmm6, 8(%rbx) +.L437: + movq -2720(%rbp), %rax + movmskpd %xmm1, %r12d + movaps %xmm3, -2704(%rbp) + movq %r12, %rdi + movaps %xmm2, -2864(%rbp) + salq $4, %r12 + leaq (%rbx,%rax,8), %rbx + call __popcountdi2@PLT + movdqa -2704(%rbp), %xmm3 + movslq %eax, %r8 + movq -2728(%rbp), %rax + movaps %xmm3, -160(%rbp) + movdqa (%rax,%r12), %xmm0 + movaps %xmm0, -1056(%rbp) + movzbl -1048(%rbp), %edx + movd %xmm0, %r9d + movaps %xmm0, -1072(%rbp) + andl $15, %r9d + movq %rdx, %rcx + movzbl -1063(%rbp), %edx + movaps %xmm0, -1088(%rbp) + movaps %xmm0, -944(%rbp) + movzbl -943(%rbp), %eax + andl $15, %ecx + movq %rdx, %rdi + movzbl -1078(%rbp), %edx + movaps %xmm0, -1104(%rbp) + andl $15, %eax + andl $15, %edi + movaps %xmm0, -1040(%rbp) + andl $15, %edx + movq %rax, 
-2704(%rbp) + movzbl -1033(%rbp), %eax + movq %rdx, -2832(%rbp) + movzbl -1093(%rbp), %edx + movq %rdi, -2656(%rbp) + andl $15, %eax + movaps %xmm0, -1024(%rbp) + movq %rdx, %rdi + movzbl -1018(%rbp), %r14d + andl $15, %edi + movq %rcx, -2720(%rbp) + movaps %xmm0, -960(%rbp) + andl $15, %r14d + movzbl -958(%rbp), %r10d + movaps %xmm0, -976(%rbp) + movzbl -973(%rbp), %r11d + movaps %xmm0, -992(%rbp) + movzbl -988(%rbp), %r12d + andl $15, %r10d + movaps %xmm0, -1008(%rbp) + movzbl -1003(%rbp), %r13d + andl $15, %r11d + movq %rdi, -2848(%rbp) + andl $15, %r12d + movaps %xmm0, -1120(%rbp) + andl $15, %r13d + movzbl -1108(%rbp), %edx + movaps %xmm0, -1136(%rbp) + movzbl -1123(%rbp), %ecx + movaps %xmm0, -1152(%rbp) + movzbl -1138(%rbp), %esi + andl $15, %edx + movaps %xmm0, -1168(%rbp) + movzbl -160(%rbp,%rax), %eax + movzbl -160(%rbp,%r14), %r14d + andl $15, %ecx + movzbl -160(%rbp,%r13), %r13d + movzbl -1153(%rbp), %edi + andl $15, %esi + salq $8, %rax + movzbl -160(%rbp,%r12), %r12d + movzbl -160(%rbp,%r11), %r11d + orq %r14, %rax + andl $15, %edi + movzbl -160(%rbp,%r10), %r10d + movzbl -160(%rbp,%rsi), %esi + salq $8, %rax + movzbl -160(%rbp,%rdi), %edi + movzbl -160(%rbp,%rcx), %ecx + orq %r13, %rax + movzbl -160(%rbp,%rdx), %edx + movzbl -160(%rbp,%r9), %r9d + salq $8, %rax + salq $8, %rdi + orq %r12, %rax + salq $8, %rax + orq %r11, %rax + movq -2704(%rbp), %r11 + salq $8, %rax + orq %r10, %rax + movzbl -160(%rbp,%r11), %r10d + movq -2848(%rbp), %r11 + salq $8, %rax + orq %r10, %rax + salq $8, %rax + orq %rsi, %rdi + movq -2656(%rbp), %rsi + salq $8, %rdi + orq %r9, %rax + orq %rcx, %rdi + movq -2720(%rbp), %rcx + movq %rax, -2704(%rbp) + salq $8, %rdi + orq %rdx, %rdi + movzbl -160(%rbp,%r11), %edx + salq $8, %rdi + orq %rdx, %rdi + movq -2832(%rbp), %rdx + salq $8, %rdi + movzbl -160(%rbp,%rdx), %edx + orq %rdx, %rdi + movzbl -160(%rbp,%rsi), %edx + salq $8, %rdi + orq %rdx, %rdi + movzbl -160(%rbp,%rcx), %edx + salq $8, %rdi + orq %rdx, %rdi + movq %rdi, -2696(%rbp) + movq -2640(%rbp), %rax + movdqa -2704(%rbp), %xmm6 + movdqa -2864(%rbp), %xmm2 + movups %xmm6, (%r15,%rax,8) + addq %r8, %rax + cmpq $5, -2688(%rbp) + movq %rax, -2640(%rbp) + jbe .L435 + movq -2648(%rbp), %rcx + movdqa -2624(%rbp), %xmm6 + movdqa %xmm2, %xmm1 + movaps %xmm2, -2880(%rbp) + movdqu 32(%rcx), %xmm3 + movdqa %xmm6, %xmm0 + pcmpeqd %xmm3, %xmm0 + psubq %xmm3, %xmm1 + movaps %xmm3, -2704(%rbp) + pand %xmm0, %xmm1 + movdqa %xmm3, %xmm0 + pcmpgtd %xmm6, %xmm0 + por %xmm0, %xmm1 + pcmpeqd %xmm0, %xmm0 + pshufd $245, %xmm1, %xmm1 + pxor %xmm1, %xmm0 + movaps %xmm1, -2864(%rbp) + movmskpd %xmm0, %r12d + movq %r12, %rdi + salq $4, %r12 + call __popcountdi2@PLT + movdqa -2704(%rbp), %xmm3 + movslq %eax, %rcx + movq -2728(%rbp), %rax + movq %rcx, -2720(%rbp) + movdqa (%rax,%r12), %xmm0 + movaps %xmm3, -144(%rbp) + movaps %xmm0, -1184(%rbp) + movzbl -1183(%rbp), %eax + movd %xmm0, %r10d + movaps %xmm0, -1200(%rbp) + andl $15, %r10d + andl $15, %eax + movaps %xmm0, -1296(%rbp) + movzbl -1288(%rbp), %edx + movq %rax, -2704(%rbp) + movzbl -1198(%rbp), %eax + movaps %xmm0, -1280(%rbp) + movq %rdx, %rdi + movaps %xmm0, -1312(%rbp) + movq %rax, %rsi + movzbl -1303(%rbp), %edx + andl $15, %edi + movzbl -1273(%rbp), %eax + movaps %xmm0, -1264(%rbp) + movzbl -1258(%rbp), %r14d + andl $15, %esi + andl $15, %edx + movaps %xmm0, -1216(%rbp) + movzbl -1213(%rbp), %r11d + andl $15, %eax + movaps %xmm0, -1232(%rbp) + andl $15, %r14d + movzbl -1228(%rbp), %r12d + movaps %xmm0, -1248(%rbp) + movzbl -1243(%rbp), %r13d + andl $15, %r11d 
+ movq %rsi, -2656(%rbp) + andl $15, %r12d + movq %rdi, -2832(%rbp) + andl $15, %r13d + movaps %xmm0, -1328(%rbp) + movq %rdx, -2848(%rbp) + movzbl -1318(%rbp), %edx + movaps %xmm0, -1344(%rbp) + movzbl -1333(%rbp), %ecx + movaps %xmm0, -1360(%rbp) + movzbl -1348(%rbp), %esi + andl $15, %edx + movaps %xmm0, -1376(%rbp) + movzbl -1363(%rbp), %edi + andl $15, %ecx + movaps %xmm0, -1392(%rbp) + movzbl -1378(%rbp), %r8d + andl $15, %esi + movaps %xmm0, -1408(%rbp) + movzbl -144(%rbp,%rax), %eax + movzbl -144(%rbp,%r14), %r14d + andl $15, %edi + movzbl -144(%rbp,%r13), %r13d + movzbl -1393(%rbp), %r9d + andl $15, %r8d + salq $8, %rax + movzbl -144(%rbp,%r12), %r12d + movzbl -144(%rbp,%r11), %r11d + orq %r14, %rax + movq -2656(%rbp), %r14 + andl $15, %r9d + movzbl -144(%rbp,%r8), %r8d + salq $8, %rax + movzbl -144(%rbp,%rdi), %edi + movzbl -144(%rbp,%r9), %r9d + orq %r13, %rax + movzbl -144(%rbp,%rsi), %esi + movzbl -144(%rbp,%rcx), %ecx + salq $8, %rax + salq $8, %r9 + movzbl -144(%rbp,%rdx), %edx + movzbl -144(%rbp,%r10), %r10d + orq %r12, %rax + salq $8, %rax + orq %r11, %rax + movzbl -144(%rbp,%r14), %r11d + salq $8, %rax + orq %r11, %rax + movq -2704(%rbp), %r11 + salq $8, %rax + movzbl -144(%rbp,%r11), %r11d + orq %r11, %rax + salq $8, %rax + orq %r8, %r9 + salq $8, %r9 + orq %r10, %rax + orq %rdi, %r9 + movq -2832(%rbp), %rdi + salq $8, %r9 + orq %rsi, %r9 + salq $8, %r9 + orq %rcx, %r9 + salq $8, %r9 + orq %rdx, %r9 + movq -2848(%rbp), %rdx + salq $8, %r9 + movzbl -144(%rbp,%rdx), %edx + orq %rdx, %r9 + movzbl -144(%rbp,%rdi), %edx + movq %rax, -2704(%rbp) + movdqa -2752(%rbp), %xmm6 + movdqa -2784(%rbp), %xmm0 + salq $8, %r9 + movq -2720(%rbp), %xmm4 + orq %rdx, %r9 + movdqa -2864(%rbp), %xmm1 + movdqa %xmm6, %xmm5 + movq %r9, -2696(%rbp) + movdqa -2880(%rbp), %xmm2 + punpcklqdq %xmm4, %xmm4 + pcmpeqd %xmm4, %xmm5 + psubq %xmm4, %xmm0 + pcmpgtd %xmm6, %xmm4 + pand %xmm5, %xmm0 + por %xmm4, %xmm0 + pshufd $245, %xmm0, %xmm0 + movq %xmm0, %rax + testq %rax, %rax + je .L438 + movdqa -2704(%rbp), %xmm6 + movq %xmm6, (%rbx) +.L438: + movhlps %xmm0, %xmm6 + movq %xmm6, %rax + testq %rax, %rax + je .L439 + movdqa -2704(%rbp), %xmm6 + movhps %xmm6, 8(%rbx) +.L439: + movq -2720(%rbp), %rax + movmskpd %xmm1, %r12d + movaps %xmm3, -2704(%rbp) + movq %r12, %rdi + movaps %xmm2, -2864(%rbp) + salq $4, %r12 + leaq (%rbx,%rax,8), %rbx + call __popcountdi2@PLT + movdqa -2704(%rbp), %xmm3 + movslq %eax, %r8 + movq -2728(%rbp), %rax + movaps %xmm3, -128(%rbp) + movdqa (%rax,%r12), %xmm0 + movaps %xmm0, -1424(%rbp) + movzbl -1423(%rbp), %eax + movd %xmm0, %ecx + movaps %xmm0, -1440(%rbp) + andl $15, %ecx + andl $15, %eax + movaps %xmm0, -1536(%rbp) + movzbl -1528(%rbp), %edx + movq %rax, -2720(%rbp) + movzbl -1438(%rbp), %eax + movaps %xmm0, -1520(%rbp) + movq %rdx, %rdi + movq %rax, %rsi + movaps %xmm0, -1552(%rbp) + movzbl -1513(%rbp), %eax + andl $15, %edi + movzbl -1543(%rbp), %edx + movaps %xmm0, -1504(%rbp) + movzbl -1498(%rbp), %r14d + andl $15, %esi + andl $15, %eax + movq %rsi, -2656(%rbp) + andl $15, %edx + movq %rdi, -2832(%rbp) + andl $15, %r14d + movq %rdx, -2848(%rbp) + movaps %xmm0, -1456(%rbp) + movzbl -1453(%rbp), %r11d + movaps %xmm0, -1472(%rbp) + movzbl -1468(%rbp), %r12d + movaps %xmm0, -1488(%rbp) + movzbl -1483(%rbp), %r13d + andl $15, %r11d + movaps %xmm0, -1568(%rbp) + movzbl -1558(%rbp), %edx + andl $15, %r12d + movq %rcx, -2704(%rbp) + andl $15, %r13d + movaps %xmm0, -1584(%rbp) + movzbl -1573(%rbp), %ecx + andl $15, %edx + movaps %xmm0, -1600(%rbp) + movzbl -1588(%rbp), %esi + 
movaps %xmm0, -1616(%rbp) + movzbl -1603(%rbp), %edi + andl $15, %ecx + movaps %xmm0, -1632(%rbp) + movzbl -1618(%rbp), %r9d + andl $15, %esi + movaps %xmm0, -1648(%rbp) + movzbl -128(%rbp,%rax), %eax + andl $15, %edi + movzbl -128(%rbp,%r14), %r14d + movzbl -128(%rbp,%r13), %r13d + movzbl -128(%rbp,%r12), %r12d + andl $15, %r9d + salq $8, %rax + movzbl -128(%rbp,%rdi), %edi + movzbl -128(%rbp,%rsi), %esi + orq %r14, %rax + movzbl -128(%rbp,%r11), %r11d + movzbl -1633(%rbp), %r10d + salq $8, %rax + movzbl -128(%rbp,%r9), %r9d + movzbl -128(%rbp,%rcx), %ecx + orq %r13, %rax + movq -2720(%rbp), %r14 + andl $15, %r10d + movzbl -128(%rbp,%rdx), %edx + salq $8, %rax + movzbl -128(%rbp,%r10), %r10d + orq %r12, %rax + movq -2656(%rbp), %r12 + salq $8, %rax + salq $8, %r10 + orq %r11, %rax + movzbl -128(%rbp,%r12), %r11d + salq $8, %rax + orq %r11, %rax + movzbl -128(%rbp,%r14), %r11d + salq $8, %rax + orq %r11, %rax + movq -2704(%rbp), %r11 + salq $8, %rax + orq %r9, %r10 + salq $8, %r10 + movzbl -128(%rbp,%r11), %r11d + orq %rdi, %r10 + movq -2832(%rbp), %rdi + salq $8, %r10 + orq %r11, %rax + orq %rsi, %r10 + movq %rax, -2704(%rbp) + salq $8, %r10 + orq %rcx, %r10 + salq $8, %r10 + orq %rdx, %r10 + movq -2848(%rbp), %rdx + salq $8, %r10 + movzbl -128(%rbp,%rdx), %edx + orq %rdx, %r10 + movzbl -128(%rbp,%rdi), %edx + salq $8, %r10 + orq %rdx, %r10 + movq %r10, -2696(%rbp) + movq -2640(%rbp), %rax + movdqa -2704(%rbp), %xmm6 + movdqa -2864(%rbp), %xmm2 + movups %xmm6, (%r15,%rax,8) + addq %r8, %rax + movq %rax, -2640(%rbp) +.L435: + movq -2688(%rbp), %rcx + leaq -2(%rcx), %rax + leaq 1(%rcx), %rdx + movq -2640(%rbp), %rcx + andq $-2, %rax + addq $2, %rax + cmpq $2, %rdx + movl $2, %edx + cmovbe %rdx, %rax + leaq 0(,%rcx,8), %r13 +.L432: + movq -2688(%rbp), %rcx + cmpq %rax, %rcx + je .L440 + movdqa -2752(%rbp), %xmm7 + subq %rax, %rcx + movdqa %xmm2, %xmm6 + movdqa -2784(%rbp), %xmm1 + movq %rcx, %xmm0 + movq -2648(%rbp), %rcx + movaps %xmm2, -2896(%rbp) + punpcklqdq %xmm0, %xmm0 + movdqa %xmm7, %xmm3 + pcmpeqd %xmm0, %xmm3 + movdqu (%rcx,%rax,8), %xmm5 + psubq %xmm0, %xmm1 + pcmpgtd %xmm7, %xmm0 + psubq %xmm5, %xmm6 + movaps %xmm5, -2688(%rbp) + pand %xmm3, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm6, %xmm1 + movdqa -2624(%rbp), %xmm6 + pshufd $245, %xmm0, %xmm4 + movdqa %xmm6, %xmm3 + movaps %xmm4, -2864(%rbp) + pcmpeqd %xmm5, %xmm3 + pand %xmm3, %xmm1 + movdqa %xmm5, %xmm3 + pcmpgtd %xmm6, %xmm3 + por %xmm3, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm6 + movaps %xmm1, -2880(%rbp) + pandn %xmm4, %xmm6 + movmskpd %xmm6, %r12d + movq %r12, %rdi + salq $4, %r12 + call __popcountdi2@PLT + movq -2728(%rbp), %rcx + movdqa -2688(%rbp), %xmm5 + cltq + movdqa (%rcx,%r12), %xmm0 + movq %rax, -2704(%rbp) + movaps %xmm5, -112(%rbp) + movaps %xmm0, -1664(%rbp) + movzbl -1663(%rbp), %eax + movd %xmm0, %ecx + movaps %xmm0, -1680(%rbp) + andl $15, %ecx + movq %rax, %rdi + movzbl -1678(%rbp), %eax + movaps %xmm0, -1760(%rbp) + movaps %xmm0, -1776(%rbp) + movzbl -1768(%rbp), %edx + andl $15, %edi + movq %rax, %rsi + movzbl -1753(%rbp), %eax + movaps %xmm0, -1744(%rbp) + movzbl -1738(%rbp), %r14d + andl $15, %esi + andl $15, %edx + movaps %xmm0, -1696(%rbp) + movzbl -1693(%rbp), %r10d + andl $15, %eax + movaps %xmm0, -1712(%rbp) + andl $15, %r14d + movzbl -1708(%rbp), %r11d + movaps %xmm0, -1728(%rbp) + movzbl -1723(%rbp), %r12d + andl $15, %r10d + movq %rcx, -2688(%rbp) + andl $15, %r11d + movq %rdi, -2720(%rbp) + andl $15, %r12d + movq %rsi, -2656(%rbp) + movaps %xmm0, -1792(%rbp) + movq %rdx, 
-2832(%rbp) + movzbl -1783(%rbp), %edx + movaps %xmm0, -1808(%rbp) + movaps %xmm0, -1824(%rbp) + movq %rdx, %r8 + movzbl -1813(%rbp), %ecx + movzbl -1798(%rbp), %edx + movaps %xmm0, -1840(%rbp) + andl $15, %r8d + movzbl -1828(%rbp), %esi + movaps %xmm0, -1856(%rbp) + andl $15, %edx + andl $15, %ecx + movzbl -1843(%rbp), %edi + movaps %xmm0, -1872(%rbp) + andl $15, %esi + movaps %xmm0, -1888(%rbp) + movzbl -112(%rbp,%rax), %eax + movzbl -112(%rbp,%r14), %r14d + andl $15, %edi + movzbl -112(%rbp,%r12), %r12d + movzbl -112(%rbp,%r11), %r11d + movq %r8, -2848(%rbp) + salq $8, %rax + movzbl -112(%rbp,%r10), %r10d + movzbl -1873(%rbp), %r9d + orq %r14, %rax + movzbl -112(%rbp,%rdi), %edi + movzbl -112(%rbp,%rsi), %esi + salq $8, %rax + movq -2720(%rbp), %r14 + andl $15, %r9d + movzbl -1858(%rbp), %r8d + orq %r12, %rax + movzbl -112(%rbp,%r9), %r9d + movzbl -112(%rbp,%rcx), %ecx + salq $8, %rax + andl $15, %r8d + movzbl -112(%rbp,%rdx), %edx + orq %r11, %rax + movzbl -112(%rbp,%r8), %r8d + movq -2688(%rbp), %r11 + salq $8, %rax + orq %r10, %rax + movq -2656(%rbp), %r10 + salq $8, %rax + movzbl -112(%rbp,%r10), %r10d + orq %r10, %rax + movzbl -112(%rbp,%r14), %r10d + salq $8, %rax + orq %r10, %rax + movzbl -112(%rbp,%r11), %r10d + salq $8, %rax + salq $8, %r9 + orq %r8, %r9 + movq -2848(%rbp), %r8 + orq %r10, %rax + salq $8, %r9 + orq %rdi, %r9 + salq $8, %r9 + orq %rsi, %r9 + salq $8, %r9 + orq %rcx, %r9 + salq $8, %r9 + orq %rdx, %r9 + movzbl -112(%rbp,%r8), %edx + movq %rax, -2688(%rbp) + movdqa -2752(%rbp), %xmm7 + movdqa -2784(%rbp), %xmm0 + salq $8, %r9 + movq -2704(%rbp), %xmm3 + orq %rdx, %r9 + movq -2832(%rbp), %rdx + movdqa %xmm7, %xmm6 + salq $8, %r9 + movdqa -2864(%rbp), %xmm4 + movdqa -2880(%rbp), %xmm1 + punpcklqdq %xmm3, %xmm3 + movzbl -112(%rbp,%rdx), %edx + movdqa -2896(%rbp), %xmm2 + pcmpeqd %xmm3, %xmm6 + psubq %xmm3, %xmm0 + pcmpgtd %xmm7, %xmm3 + orq %rdx, %r9 + movq %r9, -2680(%rbp) + pand %xmm6, %xmm0 + por %xmm3, %xmm0 + pshufd $245, %xmm0, %xmm0 + movq %xmm0, %rax + testq %rax, %rax + je .L441 + movdqa -2688(%rbp), %xmm6 + movq %xmm6, (%rbx) +.L441: + movhlps %xmm0, %xmm6 + movq %xmm6, %rax + testq %rax, %rax + je .L442 + movdqa -2688(%rbp), %xmm6 + movhps %xmm6, 8(%rbx) +.L442: + pand %xmm4, %xmm1 + movq -2704(%rbp), %rax + movaps %xmm5, -2688(%rbp) + movmskpd %xmm1, %r12d + movaps %xmm2, -2864(%rbp) + movq %r12, %rdi + leaq (%rbx,%rax,8), %rbx + salq $4, %r12 + call __popcountdi2@PLT + movq -2728(%rbp), %rcx + movdqa -2688(%rbp), %xmm5 + movl %eax, -2848(%rbp) + movdqa (%rcx,%r12), %xmm0 + movaps %xmm5, -96(%rbp) + movaps %xmm0, -1904(%rbp) + movzbl -1903(%rbp), %eax + movd %xmm0, %ecx + movaps %xmm0, -2016(%rbp) + movzbl -2008(%rbp), %edx + andl $15, %ecx + movq %rax, %rdi + movaps %xmm0, -1920(%rbp) + movzbl -1918(%rbp), %eax + andl $15, %edx + movaps %xmm0, -2032(%rbp) + andl $15, %edi + movq %rdx, -2656(%rbp) + movzbl -2023(%rbp), %edx + movq %rax, %rsi + movaps %xmm0, -2000(%rbp) + movzbl -1993(%rbp), %eax + andl $15, %esi + movaps %xmm0, -1984(%rbp) + movq %rdx, %r8 + movzbl -1978(%rbp), %r14d + andl $15, %r8d + andl $15, %eax + movq %rcx, -2688(%rbp) + movq %rdi, -2704(%rbp) + andl $15, %r14d + movq %rsi, -2720(%rbp) + movq %r8, -2832(%rbp) + movaps %xmm0, -1936(%rbp) + movzbl -1933(%rbp), %r10d + movaps %xmm0, -1952(%rbp) + movzbl -1948(%rbp), %r11d + movaps %xmm0, -1968(%rbp) + movzbl -1963(%rbp), %r12d + andl $15, %r10d + movaps %xmm0, -2048(%rbp) + movzbl -2038(%rbp), %edx + andl $15, %r11d + movaps %xmm0, -2064(%rbp) + andl $15, %r12d + movzbl -2053(%rbp), 
%ecx + movaps %xmm0, -2080(%rbp) + movzbl -2068(%rbp), %esi + andl $15, %edx + movaps %xmm0, -2096(%rbp) + movzbl -2083(%rbp), %edi + andl $15, %ecx + movaps %xmm0, -2112(%rbp) + movzbl -2098(%rbp), %r8d + andl $15, %esi + movaps %xmm0, -2128(%rbp) + movzbl -96(%rbp,%rax), %eax + andl $15, %edi + movzbl -96(%rbp,%r14), %r14d + movzbl -96(%rbp,%r12), %r12d + movzbl -96(%rbp,%r11), %r11d + andl $15, %r8d + salq $8, %rax + movzbl -96(%rbp,%rdi), %edi + movzbl -96(%rbp,%rsi), %esi + orq %r14, %rax + movzbl -96(%rbp,%r10), %r10d + movzbl -2113(%rbp), %r9d + salq $8, %rax + movzbl -96(%rbp,%r8), %r8d + movzbl -96(%rbp,%rcx), %ecx + orq %r12, %rax + movq -2704(%rbp), %r14 + andl $15, %r9d + movzbl -96(%rbp,%rdx), %edx + salq $8, %rax + movzbl -96(%rbp,%r9), %r9d + orq %r11, %rax + movq -2688(%rbp), %r11 + salq $8, %rax + salq $8, %r9 + orq %r10, %rax + movq -2720(%rbp), %r10 + salq $8, %rax + movzbl -96(%rbp,%r10), %r10d + orq %r10, %rax + movzbl -96(%rbp,%r14), %r10d + salq $8, %rax + orq %r10, %rax + movzbl -96(%rbp,%r11), %r10d + salq $8, %rax + orq %r8, %r9 + movq -2832(%rbp), %r8 + salq $8, %r9 + orq %r10, %rax + orq %rdi, %r9 + movq %rax, -2688(%rbp) + salq $8, %r9 + orq %rsi, %r9 + salq $8, %r9 + orq %rcx, %r9 + salq $8, %r9 + orq %rdx, %r9 + movzbl -96(%rbp,%r8), %edx + salq $8, %r9 + orq %rdx, %r9 + movq -2656(%rbp), %rdx + salq $8, %r9 + movzbl -96(%rbp,%rdx), %edx + orq %rdx, %r9 + movq %r9, -2680(%rbp) + movdqa -2688(%rbp), %xmm6 + movslq -2848(%rbp), %rax + addq %rax, -2640(%rbp) + movq -2640(%rbp), %rcx + movdqa -2864(%rbp), %xmm2 + movups %xmm6, (%r15,%r13) + leaq 0(,%rcx,8), %r13 +.L440: + movq -2648(%rbp), %rax + movq -2672(%rbp), %r12 + subq -2640(%rbp), %r12 + leaq (%rax,%r12,8), %rax + cmpl $8, %r13d + jnb .L443 + testl %r13d, %r13d + jne .L597 +.L444: + cmpl $8, %r13d + jnb .L447 +.L601: + testl %r13d, %r13d + jne .L598 +.L448: + movq -2672(%rbp), %rcx + movq %rbx, %rax + subq -2648(%rbp), %rax + sarq $3, %rax + subq %rax, %rcx + subq %rax, %r12 + movq %rax, -2848(%rbp) + movq %rcx, -2832(%rbp) + leaq (%rbx,%r12,8), %rax + je .L493 + movdqu (%rbx), %xmm6 + leaq 64(%rbx), %rdx + leaq -64(%rax), %rcx + movaps %xmm6, -2864(%rbp) + movdqu 16(%rbx), %xmm6 + movaps %xmm6, -2880(%rbp) + movdqu 32(%rbx), %xmm6 + movaps %xmm6, -2896(%rbp) + movdqu 48(%rbx), %xmm6 + movaps %xmm6, -2912(%rbp) + movdqu -64(%rax), %xmm6 + movaps %xmm6, -2928(%rbp) + movdqu -48(%rax), %xmm6 + movaps %xmm6, -2944(%rbp) + movdqu -32(%rax), %xmm6 + movaps %xmm6, -2960(%rbp) + movdqu -16(%rax), %xmm6 + movaps %xmm6, -2976(%rbp) + cmpq %rcx, %rdx + je .L494 + movq %r15, -2984(%rbp) + xorl %r13d, %r13d + movl $2, %r14d + movq %rcx, %r15 + movaps %xmm2, -2640(%rbp) + jmp .L455 + .p2align 4,,10 + .p2align 3 +.L600: + movdqu -64(%r15), %xmm5 + movdqu -48(%r15), %xmm4 + prefetcht0 -256(%r15) + subq $64, %r15 + movdqu 32(%r15), %xmm3 + movdqu 48(%r15), %xmm1 +.L454: + movdqa -2624(%rbp), %xmm7 + movdqa -2640(%rbp), %xmm0 + movq %rdx, -2656(%rbp) + movaps %xmm1, -2720(%rbp) + movdqa %xmm7, %xmm2 + psubq %xmm5, %xmm0 + movaps %xmm3, -2704(%rbp) + pcmpeqd %xmm5, %xmm2 + movaps %xmm4, -2688(%rbp) + pand %xmm2, %xmm0 + movdqa %xmm5, %xmm2 + pcmpgtd %xmm7, %xmm2 + pshufd $78, %xmm5, %xmm7 + por %xmm2, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm6 + movdqa %xmm0, %xmm2 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm6 + punpckhqdq %xmm0, %xmm2 + pandn %xmm6, %xmm2 + movdqa %xmm2, %xmm6 + pand %xmm7, %xmm2 + pandn %xmm5, %xmm6 + por %xmm6, %xmm2 + movaps %xmm2, -2672(%rbp) + call __popcountdi2@PLT + movdqa 
-2672(%rbp), %xmm2 + leaq -2(%r12,%r13), %rsi + movdqa -2624(%rbp), %xmm7 + movdqa -2688(%rbp), %xmm4 + movdqa -2640(%rbp), %xmm0 + cltq + movups %xmm2, (%rbx,%r13,8) + addq $2, %r13 + movups %xmm2, (%rbx,%rsi,8) + movdqa %xmm7, %xmm2 + psubq %xmm4, %xmm0 + subq %rax, %r13 + pcmpeqd %xmm4, %xmm2 + pshufd $78, %xmm4, %xmm6 + pand %xmm2, %xmm0 + movdqa %xmm4, %xmm2 + pcmpgtd %xmm7, %xmm2 + por %xmm2, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm5 + movdqa %xmm0, %xmm2 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm5 + punpckhqdq %xmm0, %xmm2 + pandn %xmm5, %xmm2 + movdqa %xmm2, %xmm5 + pand %xmm6, %xmm2 + pandn %xmm4, %xmm5 + por %xmm5, %xmm2 + movaps %xmm2, -2672(%rbp) + call __popcountdi2@PLT + movdqa -2672(%rbp), %xmm2 + leaq -4(%r12,%r13), %rsi + movdqa -2624(%rbp), %xmm7 + movdqa -2704(%rbp), %xmm3 + movdqa -2640(%rbp), %xmm0 + cltq + movups %xmm2, (%rbx,%r13,8) + movups %xmm2, (%rbx,%rsi,8) + movdqa %xmm7, %xmm2 + psubq %xmm3, %xmm0 + movq %r14, %rsi + pcmpeqd %xmm3, %xmm2 + pshufd $78, %xmm3, %xmm5 + subq %rax, %rsi + addq %rsi, %r13 + pand %xmm2, %xmm0 + movdqa %xmm3, %xmm2 + pcmpgtd %xmm7, %xmm2 + por %xmm2, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm4 + movdqa %xmm0, %xmm2 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm4 + punpckhqdq %xmm0, %xmm2 + pandn %xmm4, %xmm2 + movdqa %xmm2, %xmm4 + pand %xmm5, %xmm2 + pandn %xmm3, %xmm4 + por %xmm4, %xmm2 + movaps %xmm2, -2672(%rbp) + call __popcountdi2@PLT + movdqa -2672(%rbp), %xmm2 + leaq -6(%r12,%r13), %rdi + movq %r14, %rcx + movdqa -2624(%rbp), %xmm7 + movdqa -2720(%rbp), %xmm1 + cltq + subq $8, %r12 + movups %xmm2, (%rbx,%r13,8) + movdqa -2640(%rbp), %xmm0 + subq %rax, %rcx + movups %xmm2, (%rbx,%rdi,8) + movdqa %xmm7, %xmm2 + pshufd $78, %xmm1, %xmm4 + addq %rcx, %r13 + pcmpeqd %xmm1, %xmm2 + psubq %xmm1, %xmm0 + pand %xmm2, %xmm0 + movdqa %xmm1, %xmm2 + pcmpgtd %xmm7, %xmm2 + por %xmm2, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm2 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm3 + punpckhqdq %xmm0, %xmm2 + pandn %xmm3, %xmm2 + movdqa %xmm2, %xmm3 + pandn %xmm1, %xmm3 + movdqa %xmm2, %xmm1 + pand %xmm4, %xmm1 + por %xmm3, %xmm1 + movaps %xmm1, -2672(%rbp) + call __popcountdi2@PLT + movdqa -2672(%rbp), %xmm1 + leaq 0(%r13,%r12), %rsi + movq -2656(%rbp), %rdx + cltq + movups %xmm1, (%rbx,%r13,8) + movups %xmm1, (%rbx,%rsi,8) + movq %r14, %rsi + subq %rax, %rsi + addq %rsi, %r13 + cmpq %r15, %rdx + je .L599 +.L455: + movq %rdx, %rax + subq %rbx, %rax + sarq $3, %rax + subq %r13, %rax + cmpq $8, %rax + ja .L600 + movdqu (%rdx), %xmm5 + movdqu 16(%rdx), %xmm4 + prefetcht0 256(%rdx) + addq $64, %rdx + movdqu -32(%rdx), %xmm3 + movdqu -16(%rdx), %xmm1 + jmp .L454 + .p2align 4,,10 + .p2align 3 +.L429: + cmpq $23, %rdx + je .L592 +.L428: + movq %rdx, %rcx + addq $1, %rdx + cmpq %rax, (%r15,%rdx,8) + je .L429 + movl $12, %edx + subq $11, %rcx + subq %rbx, %rdx + cmpq %rdx, %rcx + jb .L427 +.L592: + movq (%r15,%rbx,8), %rax + jmp .L427 + .p2align 4,,10 + .p2align 3 +.L595: + movq %rdx, %rbx + movl $8, %edx + subq %rax, %rdx + leaq -8(%rax,%rbx), %r11 + movq %r8, %rax + leaq (%rdi,%rdx,8), %r13 + jmp .L381 + .p2align 4,,10 + .p2align 3 +.L599: + movdqa -2640(%rbp), %xmm2 + movq -2984(%rbp), %r15 + leaq (%rbx,%r13,8), %rdx + leaq (%r12,%r13), %r14 + addq $2, %r13 +.L452: + movdqa -2864(%rbp), %xmm7 + movdqa -2624(%rbp), %xmm6 + movdqa %xmm2, %xmm0 + movq %rdx, -2688(%rbp) + movaps %xmm2, -2672(%rbp) + movdqa %xmm7, %xmm1 + psubq %xmm7, %xmm0 + pshufd $78, %xmm7, %xmm4 + pcmpeqd %xmm6, %xmm1 + 
pand %xmm1, %xmm0 + movdqa %xmm7, %xmm1 + pcmpgtd %xmm6, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm1 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm3 + punpckhqdq %xmm0, %xmm1 + pandn %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm4, %xmm1 + pandn %xmm7, %xmm3 + por %xmm3, %xmm1 + movaps %xmm1, -2640(%rbp) + call __popcountdi2@PLT + movdqa -2640(%rbp), %xmm1 + movdqa -2880(%rbp), %xmm7 + movq -2688(%rbp), %rdx + movdqa -2624(%rbp), %xmm6 + cltq + movdqa -2672(%rbp), %xmm2 + pshufd $78, %xmm7, %xmm4 + subq %rax, %r13 + movups %xmm1, (%rdx) + movups %xmm1, -16(%rbx,%r14,8) + movdqa %xmm7, %xmm1 + movdqa %xmm2, %xmm0 + pcmpeqd %xmm6, %xmm1 + psubq %xmm7, %xmm0 + pand %xmm1, %xmm0 + movdqa %xmm7, %xmm1 + pcmpgtd %xmm6, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm1 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm3 + punpckhqdq %xmm0, %xmm1 + pandn %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm4, %xmm1 + pandn %xmm7, %xmm3 + por %xmm3, %xmm1 + movaps %xmm1, -2640(%rbp) + call __popcountdi2@PLT + movdqa -2640(%rbp), %xmm1 + leaq -4(%r12,%r13), %rdx + movdqa -2896(%rbp), %xmm7 + movdqa -2624(%rbp), %xmm6 + movdqa -2672(%rbp), %xmm2 + cltq + movups %xmm1, (%rbx,%r13,8) + pshufd $78, %xmm7, %xmm4 + subq %rax, %r13 + movups %xmm1, (%rbx,%rdx,8) + movdqa %xmm7, %xmm1 + movdqa %xmm2, %xmm0 + leaq 2(%r13), %r14 + pcmpeqd %xmm6, %xmm1 + psubq %xmm7, %xmm0 + movl $2, %r13d + pand %xmm1, %xmm0 + movdqa %xmm7, %xmm1 + pcmpgtd %xmm6, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm1 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm3 + punpckhqdq %xmm0, %xmm1 + pandn %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm4, %xmm1 + pandn %xmm7, %xmm3 + por %xmm3, %xmm1 + movaps %xmm1, -2640(%rbp) + call __popcountdi2@PLT + movdqa -2912(%rbp), %xmm7 + leaq -6(%r12,%r14), %rdx + movdqa -2640(%rbp), %xmm1 + movdqa -2624(%rbp), %xmm6 + movdqa -2672(%rbp), %xmm2 + cltq + movups %xmm1, (%rbx,%r14,8) + pshufd $78, %xmm7, %xmm4 + movups %xmm1, (%rbx,%rdx,8) + movdqa %xmm7, %xmm1 + movdqa %xmm2, %xmm0 + movq %r13, %rdx + pcmpeqd %xmm6, %xmm1 + psubq %xmm7, %xmm0 + subq %rax, %rdx + addq %rdx, %r14 + pand %xmm1, %xmm0 + movdqa %xmm7, %xmm1 + pcmpgtd %xmm6, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm1 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm3 + punpckhqdq %xmm0, %xmm1 + pandn %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm4, %xmm1 + pandn %xmm7, %xmm3 + por %xmm3, %xmm1 + movaps %xmm1, -2640(%rbp) + call __popcountdi2@PLT + movdqa -2928(%rbp), %xmm7 + leaq -8(%r12,%r14), %rdx + movdqa -2640(%rbp), %xmm1 + movdqa -2624(%rbp), %xmm6 + movdqa -2672(%rbp), %xmm2 + cltq + movups %xmm1, (%rbx,%r14,8) + pshufd $78, %xmm7, %xmm4 + movups %xmm1, (%rbx,%rdx,8) + movdqa %xmm7, %xmm1 + movdqa %xmm2, %xmm0 + movq %r13, %rdx + pcmpeqd %xmm6, %xmm1 + psubq %xmm7, %xmm0 + subq %rax, %rdx + addq %rdx, %r14 + pand %xmm1, %xmm0 + movdqa %xmm7, %xmm1 + pcmpgtd %xmm6, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm1 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm3 + punpckhqdq %xmm0, %xmm1 + pandn %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm4, %xmm1 + pandn %xmm7, %xmm3 + por %xmm3, %xmm1 + movaps %xmm1, -2640(%rbp) + call __popcountdi2@PLT + movdqa -2944(%rbp), %xmm7 + leaq -10(%r12,%r14), %rdx + movdqa -2640(%rbp), %xmm1 + movdqa -2624(%rbp), %xmm6 + movdqa -2672(%rbp), %xmm2 + cltq + movups 
%xmm1, (%rbx,%r14,8) + pshufd $78, %xmm7, %xmm4 + movups %xmm1, (%rbx,%rdx,8) + movdqa %xmm7, %xmm1 + movdqa %xmm2, %xmm0 + movq %r13, %rdx + pcmpeqd %xmm6, %xmm1 + psubq %xmm7, %xmm0 + subq %rax, %rdx + addq %rdx, %r14 + pand %xmm1, %xmm0 + movdqa %xmm7, %xmm1 + pcmpgtd %xmm6, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm1 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm3 + punpckhqdq %xmm0, %xmm1 + pandn %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm4, %xmm1 + pandn %xmm7, %xmm3 + por %xmm3, %xmm1 + movaps %xmm1, -2640(%rbp) + call __popcountdi2@PLT + movdqa -2640(%rbp), %xmm1 + leaq -12(%r12,%r14), %rdx + cltq + movups %xmm1, (%rbx,%r14,8) + movups %xmm1, (%rbx,%rdx,8) + movdqa -2672(%rbp), %xmm2 + movq %r13, %rdx + movdqa -2960(%rbp), %xmm7 + movdqa -2624(%rbp), %xmm6 + subq %rax, %rdx + movdqa %xmm7, %xmm1 + movdqa %xmm2, %xmm0 + pshufd $78, %xmm7, %xmm4 + addq %rdx, %r14 + pcmpeqd %xmm6, %xmm1 + psubq %xmm7, %xmm0 + pand %xmm1, %xmm0 + movdqa %xmm7, %xmm1 + pcmpgtd %xmm6, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm1 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm3 + punpckhqdq %xmm0, %xmm1 + pandn %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm4, %xmm1 + pandn %xmm7, %xmm3 + por %xmm3, %xmm1 + movaps %xmm1, -2640(%rbp) + call __popcountdi2@PLT + movdqa -2976(%rbp), %xmm7 + movdqa -2640(%rbp), %xmm1 + leaq -14(%r12,%r14), %rdx + movdqa -2624(%rbp), %xmm6 + movdqa -2672(%rbp), %xmm2 + cltq + movups %xmm1, (%rbx,%r14,8) + pshufd $78, %xmm7, %xmm4 + movups %xmm1, (%rbx,%rdx,8) + movdqa %xmm7, %xmm1 + movdqa %xmm2, %xmm0 + movq %r13, %rdx + pcmpeqd %xmm6, %xmm1 + psubq %xmm7, %xmm0 + subq %rax, %rdx + movaps %xmm2, -2688(%rbp) + addq %rdx, %r14 + pand %xmm1, %xmm0 + movdqa %xmm7, %xmm1 + pcmpgtd %xmm6, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm1 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm3 + punpckhqdq %xmm0, %xmm1 + pandn %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm4, %xmm1 + pandn %xmm7, %xmm3 + por %xmm3, %xmm1 + movaps %xmm1, -2640(%rbp) + call __popcountdi2@PLT + leaq -16(%r12,%r14), %rdx + movdqa -2640(%rbp), %xmm1 + movdqa -2688(%rbp), %xmm2 + cltq + subq %rax, %r13 + movups %xmm1, (%rbx,%r14,8) + leaq 0(%r13,%r14), %r12 + movups %xmm1, (%rbx,%rdx,8) + movq -2832(%rbp), %rdx + leaq 0(,%r12,8), %rax + movq %rax, -2672(%rbp) + subq %r12, %rdx +.L451: + movq -2832(%rbp), %rcx + cmpq $2, %rdx + movdqa -2624(%rbp), %xmm7 + leaq -16(,%rcx,8), %rax + cmovnb -2672(%rbp), %rax + movdqu (%rbx,%rax), %xmm6 + movups %xmm6, (%rbx,%rcx,8) + movaps %xmm6, -2640(%rbp) + movdqa -2800(%rbp), %xmm6 + movdqa %xmm6, %xmm0 + psubq %xmm6, %xmm2 + pcmpeqd %xmm7, %xmm0 + movdqa %xmm2, %xmm1 + pand %xmm0, %xmm1 + movdqa %xmm6, %xmm0 + pcmpgtd %xmm7, %xmm0 + por %xmm0, %xmm1 + pcmpeqd %xmm0, %xmm0 + pshufd $245, %xmm1, %xmm1 + pxor %xmm1, %xmm0 + movaps %xmm1, -2832(%rbp) + movmskpd %xmm0, %r13d + movq %r13, %rdi + salq $4, %r13 + call __popcountdi2@PLT + movq -2728(%rbp), %rcx + movdqa -2800(%rbp), %xmm6 + cltq + movdqa (%rcx,%r13), %xmm0 + movq %rax, -2640(%rbp) + movaps %xmm6, -80(%rbp) + movaps %xmm0, -2144(%rbp) + movzbl -2143(%rbp), %eax + movd %xmm0, %ecx + movaps %xmm0, -2224(%rbp) + andl $15, %ecx + movq %rax, %rdi + movzbl -2218(%rbp), %eax + movaps %xmm0, -2160(%rbp) + movzbl -2158(%rbp), %r10d + movaps %xmm0, -2240(%rbp) + andl $15, %edi + movaps %xmm0, -2256(%rbp) + movq %rax, %rsi + movq %r10, %r11 + movzbl -2233(%rbp), %eax + movzbl 
-2248(%rbp), %edx + andl $15, %r11d + andl $15, %esi + movaps %xmm0, -2176(%rbp) + andl $15, %eax + movaps %xmm0, -2192(%rbp) + movq %rsi, %r10 + movzbl -2188(%rbp), %r13d + andl $15, %edx + movaps %xmm0, -2208(%rbp) + movzbl -2203(%rbp), %r14d + movq %rcx, -2624(%rbp) + andl $15, %r13d + movq %rdi, -2688(%rbp) + andl $15, %r14d + movq %r11, -2704(%rbp) + movzbl -2173(%rbp), %r11d + movaps %xmm0, -2272(%rbp) + movq %rdx, -2720(%rbp) + movzbl -2263(%rbp), %edx + andl $15, %r11d + movaps %xmm0, -2288(%rbp) + movaps %xmm0, -2304(%rbp) + movq %rdx, %r8 + movzbl -2293(%rbp), %ecx + movzbl -2278(%rbp), %edx + movaps %xmm0, -2320(%rbp) + andl $15, %r8d + movzbl -2308(%rbp), %esi + movaps %xmm0, -2336(%rbp) + andl $15, %edx + andl $15, %ecx + movzbl -2323(%rbp), %edi + movaps %xmm0, -2352(%rbp) + andl $15, %esi + movaps %xmm0, -2368(%rbp) + movzbl -80(%rbp,%rax), %eax + movzbl -80(%rbp,%r10), %r10d + andl $15, %edi + movzbl -80(%rbp,%r14), %r14d + movzbl -80(%rbp,%r13), %r13d + movq %r8, -2656(%rbp) + salq $8, %rax + movzbl -80(%rbp,%r11), %r11d + movzbl -2353(%rbp), %r9d + orq %r10, %rax + movzbl -80(%rbp,%rdi), %edi + movzbl -80(%rbp,%rsi), %esi + salq $8, %rax + andl $15, %r9d + movzbl -80(%rbp,%rcx), %ecx + movzbl -2338(%rbp), %r8d + orq %r14, %rax + movq -2688(%rbp), %r14 + movzbl -80(%rbp,%r9), %r9d + salq $8, %rax + andl $15, %r8d + movzbl -80(%rbp,%rdx), %edx + orq %r13, %rax + movzbl -80(%rbp,%r8), %r8d + salq $8, %r9 + salq $8, %rax + orq %r11, %rax + movq -2704(%rbp), %r11 + salq $8, %rax + movzbl -80(%rbp,%r11), %r10d + movq -2624(%rbp), %r11 + orq %r10, %rax + movzbl -80(%rbp,%r14), %r10d + salq $8, %rax + orq %r10, %rax + movzbl -80(%rbp,%r11), %r10d + salq $8, %rax + orq %r8, %r9 + movq -2656(%rbp), %r8 + salq $8, %r9 + orq %r10, %rax + orq %rdi, %r9 + salq $8, %r9 + orq %rsi, %r9 + salq $8, %r9 + orq %rcx, %r9 + salq $8, %r9 + orq %rdx, %r9 + movzbl -80(%rbp,%r8), %edx + movq %rax, -2624(%rbp) + movdqa -2752(%rbp), %xmm6 + movdqa -2784(%rbp), %xmm0 + salq $8, %r9 + movq -2640(%rbp), %xmm2 + orq %rdx, %r9 + movq -2720(%rbp), %rdx + movdqa %xmm6, %xmm3 + salq $8, %r9 + movdqa -2832(%rbp), %xmm1 + punpcklqdq %xmm2, %xmm2 + movzbl -80(%rbp,%rdx), %edx + pcmpeqd %xmm2, %xmm3 + psubq %xmm2, %xmm0 + pcmpgtd %xmm6, %xmm2 + orq %rdx, %r9 + movq %r9, -2616(%rbp) + pand %xmm3, %xmm0 + por %xmm2, %xmm0 + pshufd $245, %xmm0, %xmm0 + movq %xmm0, %rax + testq %rax, %rax + je .L457 + movq -2672(%rbp), %rsi + movdqa -2624(%rbp), %xmm6 + movq %xmm6, (%rbx,%rsi) +.L457: + movhlps %xmm0, %xmm6 + movq %xmm6, %rax + testq %rax, %rax + je .L458 + movq -2672(%rbp), %rax + movdqa -2624(%rbp), %xmm6 + movhps %xmm6, 8(%rbx,%rax) +.L458: + addq -2640(%rbp), %r12 + movmskpd %xmm1, %r13d + movq %r13, %rdi + leaq 0(,%r12,8), %rax + salq $4, %r13 + movq %rax, -2656(%rbp) + call __popcountdi2@PLT + movdqa -2800(%rbp), %xmm6 + movl %eax, -2720(%rbp) + movq -2728(%rbp), %rax + movaps %xmm6, -64(%rbp) + movdqa (%rax,%r13), %xmm0 + movaps %xmm0, -2384(%rbp) + movzbl -2383(%rbp), %eax + movd %xmm0, %r10d + movaps %xmm0, -2496(%rbp) + movzbl -2488(%rbp), %edx + andl $15, %r10d + andl $15, %eax + movaps %xmm0, -2400(%rbp) + movaps %xmm0, -2512(%rbp) + movq %rdx, %rsi + movzbl -2503(%rbp), %edx + movq %rax, -2624(%rbp) + movzbl -2398(%rbp), %eax + andl $15, %esi + movaps %xmm0, -2464(%rbp) + andl $15, %edx + movq %rax, %rcx + movzbl -2458(%rbp), %eax + movaps %xmm0, -2528(%rbp) + movq %rdx, -2688(%rbp) + movzbl -2518(%rbp), %edx + andl $15, %ecx + movaps %xmm0, -2480(%rbp) + movq %rax, %rdi + movzbl -2473(%rbp), %eax + 
movq %rdx, %r9 + andl $15, %edi + movaps %xmm0, -2416(%rbp) + movzbl -2413(%rbp), %r11d + andl $15, %r9d + andl $15, %eax + movaps %xmm0, -2432(%rbp) + movq %rdi, %r8 + movaps %xmm0, -2448(%rbp) + movzbl -2428(%rbp), %r13d + andl $15, %r11d + movzbl -2443(%rbp), %r14d + movq %rcx, -2640(%rbp) + movq %rsi, -2672(%rbp) + andl $15, %r14d + andl $15, %r13d + movq %r9, -2704(%rbp) + movaps %xmm0, -2544(%rbp) + movzbl -2533(%rbp), %edx + movaps %xmm0, -2560(%rbp) + movzbl -2548(%rbp), %ecx + movaps %xmm0, -2576(%rbp) + movzbl -2563(%rbp), %esi + andl $15, %edx + movaps %xmm0, -2592(%rbp) + movzbl -2578(%rbp), %edi + andl $15, %ecx + movaps %xmm0, -2608(%rbp) + movzbl -64(%rbp,%rax), %eax + andl $15, %esi + movzbl -64(%rbp,%r8), %r8d + movzbl -64(%rbp,%r14), %r14d + andl $15, %edi + movzbl -64(%rbp,%rsi), %esi + salq $8, %rax + movzbl -64(%rbp,%r13), %r13d + movzbl -64(%rbp,%r11), %r11d + orq %r8, %rax + movzbl -64(%rbp,%rdi), %edi + movzbl -64(%rbp,%rcx), %ecx + salq $8, %rax + movzbl -2593(%rbp), %r9d + movzbl -64(%rbp,%rdx), %edx + orq %r14, %rax + movq -2640(%rbp), %r14 + movzbl -64(%rbp,%r10), %r10d + salq $8, %rax + andl $15, %r9d + orq %r13, %rax + movzbl -64(%rbp,%r9), %r9d + salq $8, %rax + orq %r11, %rax + movzbl -64(%rbp,%r14), %r11d + salq $8, %rax + orq %r11, %rax + movq -2624(%rbp), %r11 + salq $8, %rax + movzbl -64(%rbp,%r11), %r11d + orq %r11, %rax + salq $8, %rax + salq $8, %r9 + orq %rdi, %r9 + orq %r10, %rax + salq $8, %r9 + movq %rax, -2624(%rbp) + orq %rsi, %r9 + movq -2672(%rbp), %rsi + salq $8, %r9 + orq %rcx, %r9 + movq -2704(%rbp), %rcx + salq $8, %r9 + orq %rdx, %r9 + movzbl -64(%rbp,%rcx), %edx + salq $8, %r9 + orq %rdx, %r9 + movq -2688(%rbp), %rdx + salq $8, %r9 + movzbl -64(%rbp,%rdx), %edx + orq %rdx, %r9 + movzbl -64(%rbp,%rsi), %edx + salq $8, %r9 + orq %rdx, %r9 + movq %r9, -2616(%rbp) + movslq -2720(%rbp), %rax + movdqa -2752(%rbp), %xmm6 + movdqa -2784(%rbp), %xmm0 + movq %rax, %xmm1 + punpcklqdq %xmm1, %xmm1 + movdqa %xmm6, %xmm2 + pcmpeqd %xmm1, %xmm2 + psubq %xmm1, %xmm0 + pcmpgtd %xmm6, %xmm1 + pand %xmm2, %xmm0 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movq %xmm0, %rax + testq %rax, %rax + je .L459 + movdqa -2624(%rbp), %xmm6 + movq %xmm6, (%rbx,%r12,8) +.L459: + movhlps %xmm0, %xmm6 + movq %xmm6, %rax + testq %rax, %rax + je .L460 + movq -2656(%rbp), %rax + movdqa -2624(%rbp), %xmm6 + movhps %xmm6, 8(%rbx,%rax) +.L460: + movq -2848(%rbp), %rbx + addq %r12, %rbx + movq -2768(%rbp), %r12 + subq $1, %r12 + cmpl $2, -2812(%rbp) + je .L462 + movq -2736(%rbp), %r8 + movq %r12, %r9 + movq %r15, %rcx + movq %rbx, %rdx + movq -2808(%rbp), %rsi + movq -2648(%rbp), %rdi + call _ZN3hwy6N_SSE26detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + cmpl $3, -2812(%rbp) + je .L369 +.L462: + movq -2760(%rbp), %rdx + movq %r12, %r9 + movq %r15, %rcx + movq -2648(%rbp), %rax + movq -2736(%rbp), %r8 + movq -2808(%rbp), %rsi + subq %rbx, %rdx + leaq (%rax,%rbx,8), %rdi + call _ZN3hwy6N_SSE26detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L369: + addq $2952, %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L443: + .cfi_restore_state + movq (%rax), %rdx + leaq 8(%rbx), %rdi + movq %rax, %rsi + andq $-8, %rdi + movq %rdx, (%rbx) + movl %r13d, %edx + movq -8(%rax,%rdx), %rcx + movq %rcx, -8(%rbx,%rdx) + 
movq %rbx, %rcx + subq %rdi, %rcx + subq %rcx, %rsi + addl %r13d, %ecx + shrl $3, %ecx + rep movsq + cmpl $8, %r13d + jb .L601 +.L447: + movq (%r15), %rdx + leaq 8(%rax), %rdi + movq %r15, %rsi + andq $-8, %rdi + movq %rdx, (%rax) + movl %r13d, %edx + movq -8(%r15,%rdx), %rcx + movq %rcx, -8(%rax,%rdx) + subq %rdi, %rax + leal 0(%r13,%rax), %ecx + subq %rax, %rsi + shrl $3, %ecx + rep movsq + jmp .L448 +.L598: + movzbl (%r15), %edx + movb %dl, (%rax) + jmp .L448 +.L597: + movzbl (%rax), %edx + movb %dl, (%rbx) + jmp .L444 +.L594: + cmpq $1, %rdx + jbe .L369 + leaq 256(%rdi), %rax + cmpq %rax, %rsi + jb .L373 + movl $2, %esi + call _ZN3hwy6N_SSE26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + jmp .L369 +.L384: + movq -2648(%rbp), %rax + andl $1, %r12d + movdqa %xmm2, %xmm5 + movl $2, %edi + movdqa .LC1(%rip), %xmm7 + subq %r12, %rdi + movdqu (%rax), %xmm6 + movq %rdi, %xmm3 + punpcklqdq %xmm3, %xmm3 + movaps %xmm7, -2752(%rbp) + movaps %xmm6, -2624(%rbp) + movdqa .LC0(%rip), %xmm6 + movdqa -2624(%rbp), %xmm0 + movdqa %xmm6, %xmm1 + movaps %xmm6, -2784(%rbp) + movdqa %xmm7, %xmm6 + pcmpeqd %xmm3, %xmm6 + pcmpeqd %xmm2, %xmm0 + psubq %xmm3, %xmm1 + pcmpgtd %xmm7, %xmm3 + pand %xmm6, %xmm1 + pshufd $177, %xmm0, %xmm4 + por %xmm3, %xmm1 + pand %xmm4, %xmm0 + pshufd $245, %xmm1, %xmm1 + pandn %xmm1, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + jne .L602 + movq -2648(%rbp), %rax + pxor %xmm1, %xmm1 + movq -2760(%rbp), %r8 + pxor %xmm6, %xmm6 + movdqa %xmm1, %xmm0 + leaq 256(%rax,%rdi,8), %rsi + .p2align 4,,10 + .p2align 3 +.L390: + movq %rdi, %rcx + leaq 32(%rdi), %rdi + cmpq %rdi, %r8 + jb .L603 + leaq -256(%rsi), %rax +.L389: + movdqa (%rax), %xmm3 + leaq 32(%rax), %rdx + pxor %xmm2, %xmm3 + por %xmm3, %xmm0 + movdqa 16(%rax), %xmm3 + pxor %xmm2, %xmm3 + por %xmm3, %xmm1 + movdqa 32(%rax), %xmm3 + pxor %xmm2, %xmm3 + por %xmm3, %xmm0 + movdqa 48(%rax), %xmm3 + pxor %xmm2, %xmm3 + por %xmm3, %xmm1 + movdqa 64(%rax), %xmm3 + pxor %xmm2, %xmm3 + por %xmm3, %xmm0 + movdqa 80(%rax), %xmm3 + leaq 96(%rdx), %rax + pxor %xmm2, %xmm3 + por %xmm3, %xmm1 + movdqa 64(%rdx), %xmm3 + pxor %xmm2, %xmm3 + por %xmm3, %xmm0 + movdqa 80(%rdx), %xmm3 + pxor %xmm2, %xmm3 + por %xmm3, %xmm1 + cmpq %rsi, %rax + jne .L389 + movdqa %xmm0, %xmm3 + leaq 352(%rdx), %rsi + por %xmm1, %xmm3 + pcmpeqd %xmm6, %xmm3 + pshufd $177, %xmm3, %xmm4 + pand %xmm4, %xmm3 + movmskpd %xmm3, %eax + cmpl $3, %eax + je .L390 + movq -2648(%rbp), %rax + movdqa %xmm5, %xmm0 + pcmpeqd %xmm3, %xmm3 + movq -2648(%rbp), %rdx + pcmpeqd (%rax,%rcx,8), %xmm0 + pshufd $177, %xmm0, %xmm1 + pand %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + jne .L392 + .p2align 4,,10 + .p2align 3 +.L391: + addq $2, %rcx + movdqa %xmm5, %xmm0 + pcmpeqd (%rdx,%rcx,8), %xmm0 + pshufd $177, %xmm0, %xmm1 + pand %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + je .L391 +.L392: + rep bsfl %eax, %eax + cltq + addq %rcx, %rax +.L388: + movq -2648(%rbp), %rcx + movdqa %xmm5, %xmm3 + movdqa %xmm2, %xmm0 + leaq (%rcx,%rax,8), %rdi + movq (%rdi), %rbx + movq %rbx, %xmm1 + punpcklqdq %xmm1, %xmm1 + pcmpeqd %xmm1, %xmm3 + psubq %xmm1, %xmm0 + movaps %xmm1, -2640(%rbp) + pand %xmm3, %xmm0 + movdqa %xmm1, %xmm3 + pcmpgtd %xmm5, %xmm3 + por %xmm3, %xmm0 + pshufd $245, %xmm0, %xmm0 + movmskpd %xmm0, %edx + testl %edx, %edx + jne .L397 + movq -2760(%rbp), %rax + movq %r14, -2720(%rbp) + xorl %r13d, %r13d + movq %rbx, -2688(%rbp) + movq %rcx, %rbx + leaq -2(%rax), 
%r12 + movaps %xmm5, -2624(%rbp) + movq %rax, %r14 + movaps %xmm1, -2704(%rbp) + movaps %xmm2, -2672(%rbp) + jmp .L404 + .p2align 4,,10 + .p2align 3 +.L398: + movdqa -2672(%rbp), %xmm6 + movmskpd %xmm0, %edi + movups %xmm6, (%rbx,%r12,8) + call __popcountdi2@PLT + cltq + addq %rax, %r13 + leaq -2(%r12), %rax + cmpq %rax, %r14 + jbe .L604 + movq %rax, %r12 +.L404: + movdqu (%rbx,%r12,8), %xmm0 + pcmpeqd -2640(%rbp), %xmm0 + pshufd $177, %xmm0, %xmm1 + movdqa %xmm0, %xmm3 + movdqu (%rbx,%r12,8), %xmm0 + pand %xmm1, %xmm3 + pcmpeqd -2624(%rbp), %xmm0 + pshufd $177, %xmm0, %xmm1 + pand %xmm1, %xmm0 + movdqa %xmm3, %xmm1 + por %xmm0, %xmm1 + movmskpd %xmm1, %eax + cmpl $3, %eax + je .L398 + pcmpeqd %xmm4, %xmm4 + movq -2648(%rbp), %rcx + leaq 2(%r12), %rdx + movdqa -2624(%rbp), %xmm5 + pxor %xmm4, %xmm0 + movq -2688(%rbp), %rbx + movdqa -2704(%rbp), %xmm1 + pandn %xmm0, %xmm3 + movdqa -2672(%rbp), %xmm2 + movmskpd %xmm3, %eax + rep bsfl %eax, %eax + cltq + addq %r12, %rax + addq $4, %r12 + movq (%rcx,%rax,8), %xmm3 + movq -2760(%rbp), %rax + punpcklqdq %xmm3, %xmm3 + subq %r13, %rax + movaps %xmm3, -64(%rbp) + cmpq %r12, %rax + jb .L399 + .p2align 4,,10 + .p2align 3 +.L400: + movups %xmm1, -16(%rcx,%r12,8) + movq %r12, %rdx + addq $2, %r12 + cmpq %rax, %r12 + jbe .L400 +.L399: + movdqa -2752(%rbp), %xmm7 + subq %rdx, %rax + movdqa -2784(%rbp), %xmm4 + leaq 0(,%rdx,8), %rcx + movq %rax, %xmm0 + punpcklqdq %xmm0, %xmm0 + movdqa %xmm7, %xmm6 + pcmpeqd %xmm0, %xmm6 + psubq %xmm0, %xmm4 + pcmpgtd %xmm7, %xmm0 + pand %xmm6, %xmm4 + por %xmm4, %xmm0 + pshufd $245, %xmm0, %xmm0 + movq %xmm0, %rax + testq %rax, %rax + je .L401 + movq -2648(%rbp), %rax + movq %rbx, (%rax,%rdx,8) +.L401: + movhlps %xmm0, %xmm6 + movq %xmm6, %rax + testq %rax, %rax + je .L403 + movq -2648(%rbp), %rax + movq %rbx, 8(%rax,%rcx) +.L403: + movdqa %xmm5, %xmm0 + pcmpeqd .LC5(%rip), %xmm0 + pshufd $177, %xmm0, %xmm4 + pand %xmm4, %xmm0 + movmskpd %xmm0, %eax + cmpl $3, %eax + je .L480 + movdqa %xmm5, %xmm0 + pcmpeqd .LC6(%rip), %xmm0 + pshufd $177, %xmm0, %xmm4 + pand %xmm4, %xmm0 + movmskpd %xmm0, %eax + movl %eax, -2812(%rbp) + cmpl $3, %eax + je .L605 + movdqa %xmm1, %xmm4 + movdqa %xmm1, %xmm0 + movdqa %xmm1, %xmm6 + pcmpeqd %xmm3, %xmm4 + psubq %xmm3, %xmm0 + pand %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pcmpgtd %xmm1, %xmm4 + por %xmm4, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm4 + pand %xmm0, %xmm6 + pandn %xmm3, %xmm4 + por %xmm4, %xmm6 + movdqa %xmm6, %xmm7 + movdqa %xmm6, %xmm4 + pcmpeqd %xmm5, %xmm7 + psubq %xmm2, %xmm4 + pand %xmm7, %xmm4 + movdqa %xmm5, %xmm7 + pcmpgtd %xmm6, %xmm7 + por %xmm7, %xmm4 + pshufd $245, %xmm4, %xmm4 + movmskpd %xmm4, %eax + testl %eax, %eax + jne .L606 + movdqa %xmm2, %xmm0 + movl $32, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L414: + movq -2648(%rbp), %rbx + leaq (%rcx,%rax,2), %rdx + movdqa %xmm0, %xmm1 + addq $1, %rax + movdqu (%rbx,%rdx,8), %xmm3 + movdqa %xmm3, %xmm4 + psubq %xmm3, %xmm1 + pcmpeqd %xmm0, %xmm4 + pand %xmm4, %xmm1 + movdqa %xmm3, %xmm4 + pcmpgtd %xmm0, %xmm4 + por %xmm4, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm4 + pand %xmm0, %xmm1 + pandn %xmm3, %xmm4 + por %xmm4, %xmm1 + movdqa %xmm1, %xmm0 + cmpq $16, %rax + jne .L414 + movdqa %xmm1, %xmm3 + psubq %xmm2, %xmm1 + pcmpeqd %xmm5, %xmm3 + pand %xmm3, %xmm1 + movdqa %xmm5, %xmm3 + pcmpgtd %xmm0, %xmm3 + por %xmm3, %xmm1 + pshufd $245, %xmm1, %xmm1 + movmskpd %xmm1, %eax + testl %eax, %eax + jne .L593 + leaq 32(%rsi), %rax + cmpq %rax, -2760(%rbp) + jb .L607 + movq 
%rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L414 +.L492: + movdqa .LC0(%rip), %xmm6 + leaq _ZZN3hwy6N_SSE26detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices(%rip), %rcx + movq %rsi, %rbx + xorl %r13d, %r13d + movq $0, -2640(%rbp) + movaps %xmm6, -2784(%rbp) + movdqa .LC1(%rip), %xmm6 + movq %rcx, -2728(%rbp) + movaps %xmm6, -2752(%rbp) + jmp .L432 +.L494: + movq %r12, %r14 + movq %rbx, %rdx + movl $2, %r13d + jmp .L452 +.L493: + movq $0, -2672(%rbp) + movq %rcx, %rdx + jmp .L451 +.L596: + movq -2760(%rbp), %rsi + movq -2648(%rbp), %rdi + leaq -1(%rsi), %rbx + movq %rbx, %r12 + shrq %r12 + .p2align 4,,10 + .p2align 3 +.L430: + movq %r12, %rdx + call _ZN3hwy6N_SSE26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0 + subq $1, %r12 + jnb .L430 + .p2align 4,,10 + .p2align 3 +.L431: + movq (%rdi,%rbx,8), %rdx + movq (%rdi), %rax + movq %rbx, %rsi + movq %rdx, (%rdi) + xorl %edx, %edx + movq %rax, (%rdi,%rbx,8) + call _ZN3hwy6N_SSE26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0 + subq $1, %rbx + jne .L431 + jmp .L369 +.L482: + movl $12, %edx + movl $11, %ebx + jmp .L428 +.L483: + movl $9, %ebx + movl $10, %edx + jmp .L428 +.L484: + movl $9, %edx + jmp .L428 +.L485: + movl $8, %edx + movl $7, %ebx + jmp .L428 +.L486: + movl $6, %ebx + movl $7, %edx + jmp .L428 +.L603: + movq -2648(%rbp), %rdi + movq -2760(%rbp), %rsi + pcmpeqd %xmm3, %xmm3 + .p2align 4,,10 + .p2align 3 +.L394: + movq %rcx, %rdx + addq $2, %rcx + cmpq %rcx, %rsi + jb .L608 + movdqa %xmm5, %xmm0 + pcmpeqd -16(%rdi,%rcx,8), %xmm0 + pshufd $177, %xmm0, %xmm1 + pand %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + je .L394 +.L591: + rep bsfl %eax, %eax + cltq + addq %rdx, %rax + jmp .L388 +.L487: + movl $5, %ebx + movl $6, %edx + jmp .L428 +.L488: + movl $4, %ebx + movl $5, %edx + jmp .L428 +.L397: + movq -2760(%rbp), %rsi + leaq -64(%rbp), %rdx + movq %r15, %rcx + movdqa %xmm2, %xmm0 + movaps %xmm1, -2624(%rbp) + subq %rax, %rsi + call _ZN3hwy6N_SSE26detail22MaybePartitionTwoValueINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, %al + jne .L369 + movq (%r15), %xmm2 + movdqa -64(%rbp), %xmm3 + movdqa -2624(%rbp), %xmm1 + punpcklqdq %xmm2, %xmm2 + movdqa %xmm2, %xmm5 + jmp .L403 +.L489: + movl $3, %ebx + movl $4, %edx + jmp .L428 +.L490: + movl $2, %ebx + movl $3, %edx + jmp .L428 +.L491: + movl $1, %ebx + movl $2, %edx + jmp .L428 +.L604: + movdqa -2752(%rbp), %xmm6 + movq -2648(%rbp), %rax + movq %r12, %xmm0 + movdqa -2784(%rbp), %xmm3 + punpcklqdq %xmm0, %xmm0 + movdqa -2624(%rbp), %xmm5 + movdqa %xmm6, %xmm4 + movdqa -2704(%rbp), %xmm1 + movq -2760(%rbp), %rcx + pcmpeqd %xmm0, %xmm4 + psubq %xmm0, %xmm3 + movq -2720(%rbp), %r14 + movq -2688(%rbp), %rbx + pcmpgtd %xmm6, %xmm0 + movdqu (%rax), %xmm6 + movdqa -2672(%rbp), %xmm2 + subq %r13, %rcx + movaps %xmm6, -2624(%rbp) + movdqa -2624(%rbp), %xmm6 + pand %xmm4, %xmm3 + por %xmm0, %xmm3 + movdqa -2624(%rbp), %xmm0 + pcmpeqd %xmm5, %xmm6 + pshufd $245, %xmm3, %xmm3 + pcmpeqd %xmm1, %xmm0 + pshufd $177, %xmm6, %xmm4 + pand %xmm3, %xmm4 + pshufd $177, %xmm0, %xmm7 + pand %xmm6, %xmm4 + pcmpeqd %xmm6, %xmm6 + pand %xmm7, %xmm0 + pxor %xmm6, %xmm3 + por %xmm3, %xmm0 + por %xmm4, %xmm0 + movmskpd %xmm0, %eax + cmpl $3, %eax + jne .L609 + movmskpd %xmm4, %edi + movaps %xmm2, -2640(%rbp) + movaps %xmm1, -2624(%rbp) 
+ movq %rcx, -2672(%rbp) + call __popcountdi2@PLT + movq -2648(%rbp), %rbx + movdqa -2640(%rbp), %xmm2 + movslq %eax, %rdx + movq -2672(%rbp), %rax + movdqa -2624(%rbp), %xmm1 + movups %xmm2, (%rbx) + subq %rdx, %rax + cmpq $1, %rax + jbe .L469 + leaq -2(%rax), %rcx + movq %rcx, %rdx + shrq %rdx + salq $4, %rdx + leaq 16(%rbx,%rdx), %rdx +.L411: + movups %xmm1, (%r14) + addq $16, %r14 + cmpq %rdx, %r14 + jne .L411 + andq $-2, %rcx + movq %rcx, %rdx + addq $2, %rdx + leaq 0(,%rdx,8), %rcx + subq %rdx, %rax +.L410: + movaps %xmm1, (%r15) + testq %rax, %rax + je .L369 + movq -2648(%rbp), %rdi + leaq 0(,%rax,8), %rdx + movq %r15, %rsi + addq %rcx, %rdi + call memcpy@PLT + jmp .L369 +.L607: + movq -2760(%rbp), %rcx + movq %rbx, %rdx + jmp .L421 +.L422: + movdqu -16(%rdx,%rsi,8), %xmm1 + movdqa %xmm1, %xmm3 + movdqa %xmm1, %xmm0 + pcmpeqd %xmm5, %xmm3 + psubq %xmm2, %xmm0 + pand %xmm3, %xmm0 + movdqa %xmm5, %xmm3 + pcmpgtd %xmm1, %xmm3 + por %xmm3, %xmm0 + pshufd $245, %xmm0, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + jne .L593 +.L421: + movq %rsi, %rax + addq $2, %rsi + cmpq %rsi, %rcx + jnb .L422 + movq -2760(%rbp), %rbx + cmpq %rax, %rbx + je .L480 + movq -2648(%rbp), %rax + movdqu -16(%rax,%rbx,8), %xmm1 + movdqa %xmm1, %xmm3 + movdqa %xmm1, %xmm0 + pcmpeqd %xmm5, %xmm3 + pcmpgtd %xmm1, %xmm5 + psubq %xmm2, %xmm0 + pand %xmm3, %xmm0 + por %xmm5, %xmm0 + pshufd $245, %xmm0, %xmm0 + movmskpd %xmm0, %eax + cmpl $1, %eax + movl $1, %eax + adcl $0, %eax + movl %eax, -2812(%rbp) + jmp .L423 +.L425: + movl $11, %edx + movl $10, %ebx + jmp .L428 +.L608: + movq -2760(%rbp), %rax + leaq -2(%rax), %rdx + movq -2648(%rbp), %rax + movdqu (%rax,%rdx,8), %xmm6 + movaps %xmm6, -2624(%rbp) + movdqa -2624(%rbp), %xmm0 + pcmpeqd %xmm5, %xmm0 + pshufd $177, %xmm0, %xmm1 + pand %xmm1, %xmm0 + pcmpeqd %xmm1, %xmm1 + pxor %xmm1, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + je .L369 + jmp .L591 +.L602: + rep bsfl %eax, %eax + cltq + jmp .L388 +.L373: + movq %rdx, %r11 + leaq -2(%rdx), %rdx + movq -2648(%rbp), %r10 + leaq 8(%rcx), %rdi + movq %rdx, %rbx + andq $-8, %rdi + andq $-2, %rdx + shrq %rbx + movq (%r10), %rax + movq %r10, %rsi + leaq 2(%rdx), %r12 + addq $1, %rbx + salq $4, %rbx + movq %rax, (%rcx) + movl %ebx, %r8d + leaq 8(%r10,%r8), %r14 + leaq 8(%rcx,%r8), %r13 + movq -16(%r14), %rax + movq %rax, -16(%r13) + movq %rcx, %rax + subq %rdi, %rax + subq %rax, %rsi + leal (%rbx,%rax), %ecx + movq %r11, %rax + shrl $3, %ecx + subq %r12, %rax + rep movsq + movq %r11, -2760(%rbp) + movq %rax, %rcx + movq %rax, -2624(%rbp) + je .L374 + leaq 0(,%r12,8), %rax + leaq 0(,%rcx,8), %rdx + movq %r8, -2640(%rbp) + leaq (%r10,%rax), %rsi + leaq (%r15,%rax), %rdi + call memcpy@PLT + movq -2760(%rbp), %r11 + movl $32, %ecx + movq -2640(%rbp), %r8 + movl %r11d, %eax + subl $1, %eax + bsrl %eax, %eax + xorl $31, %eax + subl %eax, %ecx + movl $1, %eax + salq %cl, %rax + shrq $4, %rax + movq %rax, %rsi + movl $1, %eax + cmove %rax, %rsi + movq %rsi, %rax + salq $4, %rax + addq $2, %rax + cmpq %rax, %r11 + jnb .L610 +.L375: + movdqa .LC4(%rip), %xmm0 + movq -2760(%rbp), %rdx +.L379: + movups %xmm0, (%r15,%rdx,8) + addq $2, %rdx + cmpq %rax, %rdx + jb .L379 + movq %r15, %rdi + movq %r8, -2640(%rbp) + call _ZN3hwy6N_SSE26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + movq -2648(%rbp), %rcx + movq (%r15), %rax + movq %r15, %rsi + movq %rax, (%rcx) + movq -2640(%rbp), %r8 + leaq 8(%rcx), %rdi + andq $-8, %rdi + movq -8(%r15,%r8), %rax + movq %rax, -8(%rcx,%r8) 
+ movq %rcx, %rax + subq %rdi, %rax + leal (%rbx,%rax), %ecx + subq %rax, %rsi + shrl $3, %ecx + rep movsq + cmpq $0, -2624(%rbp) + je .L369 +.L377: + movq -2648(%rbp), %rdi + movq -2624(%rbp), %rdx + salq $3, %r12 + leaq (%r15,%r12), %rsi + addq %r12, %rdi + salq $3, %rdx + call memcpy@PLT + jmp .L369 +.L480: + movl $2, -2812(%rbp) + jmp .L423 +.L605: + pcmpeqd %xmm0, %xmm0 + paddq %xmm0, %xmm2 + jmp .L423 +.L606: + movdqa %xmm0, %xmm4 + pand %xmm3, %xmm0 + pandn %xmm1, %xmm4 + movdqa %xmm2, %xmm1 + por %xmm4, %xmm0 + movdqa %xmm0, %xmm3 + psubq %xmm0, %xmm1 + pcmpeqd %xmm5, %xmm3 + pcmpgtd %xmm5, %xmm0 + pand %xmm3, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + jne .L593 + movdqa %xmm2, %xmm0 + movl $32, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L415: + movq -2648(%rbp), %rbx + leaq (%rcx,%rax,2), %rdx + movdqa %xmm0, %xmm1 + addq $1, %rax + movdqu (%rbx,%rdx,8), %xmm3 + movdqa %xmm3, %xmm4 + psubq %xmm3, %xmm1 + pcmpeqd %xmm0, %xmm4 + pand %xmm4, %xmm1 + movdqa %xmm3, %xmm4 + pcmpgtd %xmm0, %xmm4 + por %xmm4, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm4 + pand %xmm1, %xmm3 + pandn %xmm0, %xmm4 + por %xmm4, %xmm3 + movdqa %xmm3, %xmm0 + cmpq $16, %rax + jne .L415 + pcmpeqd %xmm5, %xmm3 + movdqa %xmm2, %xmm1 + psubq %xmm0, %xmm1 + pand %xmm3, %xmm1 + movdqa %xmm0, %xmm3 + pcmpgtd %xmm5, %xmm3 + por %xmm3, %xmm1 + pshufd $245, %xmm1, %xmm1 + movmskpd %xmm1, %eax + testl %eax, %eax + jne .L593 + leaq 32(%rsi), %rax + cmpq %rax, -2760(%rbp) + jb .L611 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L415 +.L374: + movq -2760(%rbp), %rdi + movl $32, %ecx + movl %edi, %eax + subl $1, %eax + bsrl %eax, %eax + xorl $31, %eax + subl %eax, %ecx + movl $1, %eax + salq %cl, %rax + shrq $4, %rax + movq %rax, %rsi + movl $1, %eax + cmove %rax, %rsi + movq %rsi, %rax + salq $4, %rax + addq $2, %rax + cmpq %rax, %rdi + jb .L375 + movq %r15, %rdi + call _ZN3hwy6N_SSE26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + movq -2648(%rbp), %rcx + movq (%r15), %rax + movq %r15, %rsi + movq %rax, (%rcx) + movq -16(%r13), %rax + leaq 8(%rcx), %rdi + andq $-8, %rdi + movq %rax, -16(%r14) + movq %rcx, %rax + subq %rdi, %rax + leal (%rbx,%rax), %ecx + subq %rax, %rsi + shrl $3, %ecx + rep movsq + jmp .L369 + .p2align 4,,10 + .p2align 3 +.L469: + xorl %ecx, %ecx + jmp .L410 +.L609: + pxor %xmm6, %xmm0 + movq -2648(%rbp), %rsi + movmskpd %xmm0, %eax + rep bsfl %eax, %eax + cltq + movq (%rsi,%rax,8), %xmm3 + leaq 2(%r12), %rax + punpcklqdq %xmm3, %xmm3 + movaps %xmm3, -64(%rbp) + cmpq %rcx, %rax + ja .L406 +.L407: + movq -2648(%rbp), %rsi + movq %rax, %r12 + movups %xmm1, -16(%rsi,%rax,8) + addq $2, %rax + cmpq %rcx, %rax + jbe .L407 +.L406: + movq %rcx, %rax + movdqa -2752(%rbp), %xmm7 + movdqa -2784(%rbp), %xmm0 + leaq 0(,%r12,8), %rdx + subq %r12, %rax + movq %rax, %xmm4 + movdqa %xmm7, %xmm6 + punpcklqdq %xmm4, %xmm4 + pcmpeqd %xmm4, %xmm6 + psubq %xmm4, %xmm0 + pcmpgtd %xmm7, %xmm4 + pand %xmm6, %xmm0 + por %xmm4, %xmm0 + pshufd $245, %xmm0, %xmm0 + movq %xmm0, %rax + testq %rax, %rax + je .L408 + movq -2648(%rbp), %rax + movq %rbx, (%rax,%r12,8) +.L408: + movhlps %xmm0, %xmm6 + movq %xmm6, %rax + testq %rax, %rax + je .L403 + movq -2648(%rbp), %rax + movq %rbx, 8(%rax,%rdx) + jmp .L403 +.L611: + movq %rbx, %rdx + jmp .L417 +.L418: + movdqu -16(%rdx,%rsi,8), %xmm1 + movdqa %xmm2, %xmm0 + movdqa %xmm1, %xmm3 + psubq %xmm1, %xmm0 + pcmpeqd %xmm5, %xmm3 + 
pcmpgtd %xmm5, %xmm1 + pand %xmm3, %xmm0 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + jne .L593 +.L417: + movq %rsi, %rax + addq $2, %rsi + cmpq %rsi, -2760(%rbp) + jnb .L418 + movq -2760(%rbp), %rbx + cmpq %rax, %rbx + je .L419 + movq -2648(%rbp), %rax + movdqa %xmm2, %xmm0 + movdqu -16(%rax,%rbx,8), %xmm1 + movdqa %xmm1, %xmm3 + psubq %xmm1, %xmm0 + pcmpeqd %xmm5, %xmm3 + pcmpgtd %xmm5, %xmm1 + pand %xmm3, %xmm0 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + jne .L593 +.L419: + movl $3, -2812(%rbp) + pcmpeqd %xmm0, %xmm0 + paddq %xmm0, %xmm2 + jmp .L423 +.L610: + movq %r15, %rdi + call _ZN3hwy6N_SSE26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + movq -2648(%rbp), %rcx + movq (%r15), %rax + movq %r15, %rsi + movq %rax, (%rcx) + movq -16(%r13), %rax + leaq 8(%rcx), %rdi + andq $-8, %rdi + movq %rax, -16(%r14) + movq %rcx, %rax + subq %rdi, %rax + leal (%rbx,%rax), %ecx + subq %rax, %rsi + shrl $3, %ecx + rep movsq + jmp .L377 + .cfi_endproc +.LFE18797: + .size _ZN3hwy6N_SSE26detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0, .-_ZN3hwy6N_SSE26detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + .section .text._ZN3hwy11N_AVX3_ZEN46detail7RecurseINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy11N_AVX3_ZEN46detail7RecurseINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0, @function +_ZN3hwy11N_AVX3_ZEN46detail7RecurseINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0: +.LFB18799: + .cfi_startproc + leaq 8(%rsp), %r10 + .cfi_def_cfa 10, 0 + andq $-64, %rsp + pushq -8(%r10) + pushq %rbp + movq %rsp, %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + pushq %r15 + .cfi_escape 0x10,0xf,0x2,0x76,0x78 + movq %rdx, %r15 + pushq %r14 + pushq %r13 + .cfi_escape 0x10,0xe,0x2,0x76,0x70 + .cfi_escape 0x10,0xd,0x2,0x76,0x68 + movq %rdi, %r13 + pushq %r12 + .cfi_escape 0x10,0xc,0x2,0x76,0x60 + movq %rcx, %r12 + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x58,0x6 + pushq %rbx + addq $-128, %rsp + .cfi_escape 0x10,0x3,0x2,0x76,0x50 + movq %rsi, -136(%rbp) + movq %r9, -128(%rbp) + cmpq $128, %rdx + jbe .L786 + movq %rdi, %r11 + movq %rdi, -152(%rbp) + movq %r8, %rbx + shrq $3, %r11 + movq %r11, %rdi + andl $7, %edi + movq %rdi, -144(%rbp) + jne .L787 + movq %rdx, -120(%rbp) + movq %r13, %r11 +.L625: + movq 8(%rbx), %rdx + movq 16(%rbx), %r9 + movq %rdx, %rsi + leaq 1(%r9), %rdi + leaq (%rdx,%rdx,8), %rcx + xorq (%rbx), %rdi + shrq $11, %rsi + rorx $40, %rdx, %rax + leaq 2(%r9), %rdx + addq %rdi, %rax + xorq %rsi, %rcx + movq %rax, %r8 + rorx $40, %rax, %rsi + xorq %rdx, %rcx + shrq $11, %r8 + leaq (%rax,%rax,8), %rdx + leaq 3(%r9), %rax + addq %rcx, %rsi + xorq %r8, %rdx + movq %rsi, %r8 + xorq %rax, %rdx + leaq (%rsi,%rsi,8), %rax + rorx $40, %rsi, %r10 + shrq $11, %r8 + addq %rdx, %r10 + leaq 4(%r9), %rsi + addq $5, %r9 + xorq %r8, %rax + rorx $40, %r10, %r8 + movq %r9, 16(%rbx) + xorq %rsi, %rax + movq %r10, %rsi + shrq $11, %rsi + addq %rax, %r8 + movq %rsi, %r14 + leaq (%r10,%r10,8), %rsi + leaq (%r8,%r8,8), %r10 + xorq %r14, %rsi + movq %r8, %r14 + rorx $40, %r8, %r8 + shrq 
$11, %r14 + xorq %r9, %rsi + movabsq $34359738359, %r9 + xorq %r14, %r10 + addq %rsi, %r8 + movl %esi, %esi + vmovq %r10, %xmm6 + movq -120(%rbp), %r10 + vpinsrq $1, %r8, %xmm6, %xmm0 + movq %r10, %r14 + vmovdqu %xmm0, (%rbx) + shrq $3, %r14 + cmpq %r9, %r10 + movl $4294967295, %r9d + movq %r14, %r8 + leaq 192(%r12), %r14 + cmova %r9, %r8 + movl %edi, %r9d + shrq $32, %rdi + imulq %r8, %r9 + imulq %r8, %rdi + imulq %r8, %rsi + shrq $32, %r9 + salq $6, %r9 + shrq $32, %rdi + vmovdqa64 (%r11,%r9), %zmm2 + movl %ecx, %r9d + shrq $32, %rcx + imulq %r8, %r9 + salq $6, %rdi + shrq $32, %rsi + imulq %r8, %rcx + salq $6, %rsi + vmovdqa64 (%r11,%rsi), %zmm5 + shrq $32, %r9 + salq $6, %r9 + shrq $32, %rcx + vmovdqa64 (%r11,%r9), %zmm3 + salq $6, %rcx + vpminsq %zmm3, %zmm2, %zmm0 + vpmaxsq (%r11,%rdi), %zmm0, %zmm0 + vpmaxsq %zmm3, %zmm2, %zmm2 + vpminsq %zmm2, %zmm0, %zmm0 + vmovdqa64 (%r11,%rcx), %zmm2 + movq %rdx, %rcx + movl %edx, %edx + shrq $32, %rcx + imulq %r8, %rdx + vmovdqa64 %zmm0, (%r12) + imulq %r8, %rcx + shrq $32, %rdx + shrq $32, %rcx + salq $6, %rdx + salq $6, %rcx + vmovdqa64 (%r11,%rcx), %zmm4 + vpminsq %zmm4, %zmm2, %zmm3 + vpmaxsq (%r11,%rdx), %zmm3, %zmm3 + movl %eax, %edx + shrq $32, %rax + imulq %r8, %rdx + vpmaxsq %zmm4, %zmm2, %zmm2 + imulq %r8, %rax + vpminsq %zmm2, %zmm3, %zmm3 + vmovdqa64 %zmm3, 64(%r12) + shrq $32, %rdx + salq $6, %rdx + shrq $32, %rax + vmovdqa64 (%r11,%rdx), %zmm4 + salq $6, %rax + vpminsq %zmm5, %zmm4, %zmm2 + vpmaxsq (%r11,%rax), %zmm2, %zmm1 + vpmaxsq %zmm5, %zmm4, %zmm4 + vpbroadcastq %xmm0, %zmm5 + vpminsq %zmm4, %zmm1, %zmm1 + vpxord %zmm0, %zmm5, %zmm0 + vpxord %zmm3, %zmm5, %zmm3 + vmovdqa64 %zmm1, 128(%r12) + vpord %zmm3, %zmm0, %zmm0 + vpxord %zmm1, %zmm5, %zmm1 + vmovdqa32 %zmm5, %zmm2 + vpord %zmm1, %zmm0, %zmm0 + vptestnmq %zmm0, %zmm0, %k0 + kortestb %k0, %k0 + jc .L627 + vpbroadcastq .LC10(%rip), %zmm0 + movl $2, %esi + movq %r12, %rdi + vmovdqu64 %zmm0, 192(%r12) + vmovdqu64 %zmm0, 256(%r12) + vzeroupper + call _ZN3hwy11N_AVX3_ZEN46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + vpbroadcastq (%r12), %zmm2 + vpbroadcastq 184(%r12), %zmm1 + vpternlogd $0xFF, %zmm0, %zmm0, %zmm0 + vpaddq %zmm0, %zmm1, %zmm0 + vpcmpq $0, %zmm0, %zmm2, %k0 + kortestb %k0, %k0 + jnc .L629 + leaq -112(%rbp), %rdx + movq %r14, %rcx + vmovdqa64 %zmm2, %zmm0 + movq %r15, %rsi + movq %r13, %rdi + call _ZN3hwy11N_AVX3_ZEN46detail22MaybePartitionTwoValueINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, %al + jne .L781 +.L629: + movq 96(%r12), %rdx + cmpq %rdx, 88(%r12) + jne .L727 + cmpq 80(%r12), %rdx + jne .L670 + cmpq 72(%r12), %rdx + jne .L728 + cmpq 64(%r12), %rdx + jne .L729 + cmpq 56(%r12), %rdx + jne .L730 + cmpq 48(%r12), %rdx + jne .L731 + cmpq 40(%r12), %rdx + jne .L732 + cmpq 32(%r12), %rdx + jne .L733 + cmpq 24(%r12), %rdx + jne .L734 + cmpq 16(%r12), %rdx + jne .L735 + cmpq 8(%r12), %rdx + jne .L736 + movq (%r12), %rax + cmpq %rax, %rdx + jne .L788 +.L672: + vpbroadcastq %rax, %zmm2 +.L785: + movl $1, -144(%rbp) +.L667: + cmpq $0, -128(%rbp) + je .L789 + leaq -8(%r15), %rdx + movq %rdx, %r9 + leaq 0(%r13,%rdx,8), %r10 + movq %rdx, %rcx + andl $31, %r9d + vmovdqu64 (%r10), %zmm16 + movq %r9, -120(%rbp) + andl $24, %ecx + je .L677 + vmovdqu64 0(%r13), %zmm1 + leaq _ZZN3hwy11N_AVX3_ZEN4L8CompressIlLPv0EEENS0_6Vec512IT_EES5_NS0_7Mask512IS4_EEE12packed_array(%rip), %r14 + vmovdqa64 .LC9(%rip), %zmm0 
+ movq $-1, %rdi + vpcmpq $6, %zmm2, %zmm1, %k1 + knotb %k1, %k4 + kmovb %k4, %eax + movq %rax, %rsi + kmovb %k1, %ecx + vpbroadcastq (%r14,%rsi,8), %zmm3 + movzbl %cl, %esi + popcntq %rax, %rax + xorl %ecx, %ecx + bzhi %rax, %rdi, %r8 + kmovb %r8d, %k6 + leaq 0(%r13,%rax,8), %rax + popcntq %rsi, %rcx + vpsrlvq %zmm0, %zmm3, %zmm3 + vpermq %zmm1, %zmm3, %zmm3 + vmovdqu64 %zmm3, 0(%r13){%k6} + vpbroadcastq (%r14,%rsi,8), %zmm3 + vpsrlvq %zmm0, %zmm3, %zmm3 + vpermq %zmm1, %zmm3, %zmm1 + vmovdqu64 %zmm1, (%r12) + testb $16, %dl + je .L678 + vmovdqu64 64(%r13), %zmm1 + vpcmpq $6, %zmm2, %zmm1, %k1 + knotb %k1, %k4 + kmovb %k4, %esi + movq %rsi, %r10 + kmovb %k1, %r8d + vpbroadcastq (%r14,%r10,8), %zmm3 + popcntq %rsi, %rsi + bzhi %rsi, %rdi, %r11 + kmovb %r11d, %k7 + vpsrlvq %zmm0, %zmm3, %zmm3 + vpermq %zmm1, %zmm3, %zmm3 + vmovdqu64 %zmm3, (%rax){%k7} + leaq (%rax,%rsi,8), %rax + movzbl %r8b, %esi + xorl %r8d, %r8d + vpbroadcastq (%r14,%rsi,8), %zmm3 + popcntq %rsi, %r8 + vpsrlvq %zmm0, %zmm3, %zmm3 + vpermq %zmm1, %zmm3, %zmm1 + vmovdqu64 %zmm1, (%r12,%rcx,8) + addq %r8, %rcx + cmpq $23, %r9 + jbe .L678 + vmovdqu64 128(%r13), %zmm1 + vpcmpq $6, %zmm2, %zmm1, %k1 + knotb %k1, %k4 + kmovb %k4, %esi + movq %rsi, %r10 + kmovb %k1, %r8d + vpbroadcastq (%r14,%r10,8), %zmm3 + popcntq %rsi, %rsi + bzhi %rsi, %rdi, %rdi + kmovb %edi, %k2 + vpsrlvq %zmm0, %zmm3, %zmm3 + vpermq %zmm1, %zmm3, %zmm3 + vmovdqu64 %zmm3, (%rax){%k2} + leaq (%rax,%rsi,8), %rax + movzbl %r8b, %esi + xorl %r8d, %r8d + vpbroadcastq (%r14,%rsi,8), %zmm3 + popcntq %rsi, %r8 + leaq 1(%r9), %rsi + vpsrlvq %zmm0, %zmm3, %zmm3 + vpermq %zmm1, %zmm3, %zmm1 + vmovdqu64 %zmm1, (%r12,%rcx,8) + addq %r8, %rcx + cmpq $9, %rsi + sbbq %rsi, %rsi + leaq 0(,%rcx,8), %r8 + andq $-16, %rsi + addq $24, %rsi + cmpq %rsi, %r9 + jne .L790 +.L680: + movq %rax, %r9 + movq %rdx, %rsi + subq %r13, %r9 + subq %rcx, %rsi + movq %r9, %rdi + leaq 0(%r13,%rsi,8), %r11 + sarq $3, %rdi + subq %rdi, %rdx + movq %rdi, -120(%rbp) + movq %rdx, %rcx + movq %rdx, -152(%rbp) + movq %rsi, %rdx + subq %rdi, %rdx + leaq (%rax,%rcx,8), %rdi + leaq (%rax,%rdx,8), %r10 + vmovq %rdi, %xmm17 + .p2align 4,,10 + .p2align 3 +.L682: + cmpl $8, %r8d + jnb .L684 + testl %r8d, %r8d + jne .L791 +.L685: + cmpl $8, %r8d + jnb .L688 + testl %r8d, %r8d + jne .L792 +.L689: + testq %rdx, %rdx + je .L739 +.L703: + leaq 256(%rax), %rsi + leaq -256(%r10), %rdi + vmovdqu64 (%rax), %zmm15 + vmovdqu64 64(%rax), %zmm14 + vmovdqu64 128(%rax), %zmm13 + vmovdqu64 192(%rax), %zmm12 + vmovdqu64 -128(%r10), %zmm9 + vmovdqu64 -64(%r10), %zmm8 + vmovdqu64 -256(%r10), %zmm11 + vmovdqu64 -192(%r10), %zmm10 + cmpq %rdi, %rsi + je .L740 + xorl %ecx, %ecx + leaq _ZZN3hwy11N_AVX3_ZEN4L11CompressNotIlLPv0EEENS0_6Vec512IT_EES5_NS0_7Mask512IS4_EEE12packed_array(%rip), %r9 + movl $8, %r8d + jmp .L696 + .p2align 4,,10 + .p2align 3 +.L794: + vmovdqu64 -128(%rdi), %zmm3 + vmovdqu64 -64(%rdi), %zmm1 + prefetcht0 -1024(%rdi) + subq $256, %rdi + vmovdqu64 (%rdi), %zmm5 + vmovdqu64 64(%rdi), %zmm4 +.L695: + vpcmpq $6, %zmm2, %zmm5, %k2 + vpcmpq $6, %zmm2, %zmm4, %k7 + vpcmpq $6, %zmm2, %zmm3, %k6 + vpcmpq $6, %zmm2, %zmm1, %k5 + kmovb %k2, %r11d + vpbroadcastq (%r9,%r11,8), %zmm18 + movq %r11, %r10 + leaq -8(%rdx,%rcx), %r11 + popcntq %r10, %r10 + vpsrlvq %zmm0, %zmm18, %zmm18 + vpermq %zmm5, %zmm18, %zmm5 + vmovdqu64 %zmm5, (%rax,%rcx,8) + addq $8, %rcx + vmovdqu64 %zmm5, (%rax,%r11,8) + kmovb %k7, %r11d + subq %r10, %rcx + vpbroadcastq (%r9,%r11,8), %zmm5 + movq %r11, %r10 + leaq -16(%rdx,%rcx), %r11 + vpsrlvq 
%zmm0, %zmm5, %zmm5 + popcntq %r10, %r10 + vpermq %zmm4, %zmm5, %zmm4 + vmovdqu64 %zmm4, (%rax,%rcx,8) + vmovdqu64 %zmm4, (%rax,%r11,8) + movq %r8, %r11 + subq %r10, %r11 + addq %r11, %rcx + kmovb %k6, %r11d + vpbroadcastq (%r9,%r11,8), %zmm4 + movq %r11, %r10 + leaq -24(%rdx,%rcx), %r11 + popcntq %r10, %r10 + subq $32, %rdx + vpsrlvq %zmm0, %zmm4, %zmm4 + vpermq %zmm3, %zmm4, %zmm3 + vmovdqu64 %zmm3, (%rax,%rcx,8) + vmovdqu64 %zmm3, (%rax,%r11,8) + movq %r8, %r11 + subq %r10, %r11 + addq %rcx, %r11 + kmovb %k5, %ecx + vpbroadcastq (%r9,%rcx,8), %zmm3 + movq %rcx, %r10 + leaq (%r11,%rdx), %rcx + popcntq %r10, %r10 + vpsrlvq %zmm0, %zmm3, %zmm3 + vpermq %zmm1, %zmm3, %zmm1 + vmovdqu64 %zmm1, (%rax,%r11,8) + vmovdqu64 %zmm1, (%rax,%rcx,8) + movq %r8, %rcx + subq %r10, %rcx + addq %r11, %rcx + cmpq %rdi, %rsi + je .L793 +.L696: + movq %rsi, %r10 + subq %rax, %r10 + sarq $3, %r10 + subq %rcx, %r10 + cmpq $32, %r10 + ja .L794 + vmovdqu64 (%rsi), %zmm5 + vmovdqu64 64(%rsi), %zmm4 + prefetcht0 1024(%rsi) + addq $256, %rsi + vmovdqu64 -128(%rsi), %zmm3 + vmovdqu64 -64(%rsi), %zmm1 + jmp .L695 + .p2align 4,,10 + .p2align 3 +.L787: + movl $8, %eax + subq %rdi, %rax + leaq 0(%r13,%rax,8), %r11 + leaq -8(%rdi,%rdx), %rax + movq %rax, -120(%rbp) + jmp .L625 + .p2align 4,,10 + .p2align 3 +.L793: + leaq (%rdx,%rcx), %r8 + leaq (%rax,%rcx,8), %r10 + addq $8, %rcx +.L693: + vpcmpq $6, %zmm2, %zmm15, %k4 + vpcmpq $6, %zmm2, %zmm14, %k3 + xorl %esi, %esi + vpcmpq $6, %zmm2, %zmm13, %k1 + kmovb %k4, %edi + vpbroadcastq (%r9,%rdi,8), %zmm1 + popcntq %rdi, %rsi + kmovb %k3, %edi + subq %rsi, %rcx + movq %rdi, %rsi + vpsrlvq %zmm0, %zmm1, %zmm1 + popcntq %rsi, %rsi + vpcmpq $6, %zmm2, %zmm12, %k4 + vpcmpq $6, %zmm2, %zmm11, %k3 + vpermq %zmm15, %zmm1, %zmm15 + vpbroadcastq (%r9,%rdi,8), %zmm1 + leaq -16(%rdx,%rcx), %rdi + vmovdqu64 %zmm15, (%r10) + vpsrlvq %zmm0, %zmm1, %zmm1 + vmovdqu64 %zmm15, -64(%rax,%r8,8) + vpermq %zmm14, %zmm1, %zmm14 + vmovdqu64 %zmm14, (%rax,%rcx,8) + subq %rsi, %rcx + vmovdqu64 %zmm14, (%rax,%rdi,8) + kmovb %k1, %edi + addq $8, %rcx + vpbroadcastq (%r9,%rdi,8), %zmm1 + movq %rdi, %rsi + vpcmpq $6, %zmm2, %zmm10, %k1 + leaq -24(%rdx,%rcx), %rdi + vpsrlvq %zmm0, %zmm1, %zmm1 + vpermq %zmm13, %zmm1, %zmm13 + vmovdqu64 %zmm13, (%rax,%rcx,8) + vmovdqu64 %zmm13, (%rax,%rdi,8) + xorl %edi, %edi + popcntq %rsi, %rdi + movl $8, %esi + movq %rsi, %r8 + subq %rdi, %r8 + kmovb %k4, %edi + vpbroadcastq (%r9,%rdi,8), %zmm1 + vpcmpq $6, %zmm2, %zmm9, %k4 + addq %rcx, %r8 + movq %rdi, %rcx + vpsrlvq %zmm0, %zmm1, %zmm1 + leaq -32(%rdx,%r8), %rdi + popcntq %rcx, %rcx + vpermq %zmm12, %zmm1, %zmm12 + vmovdqu64 %zmm12, (%rax,%r8,8) + vmovdqu64 %zmm12, (%rax,%rdi,8) + movq %rsi, %rdi + subq %rcx, %rdi + addq %r8, %rdi + kmovb %k3, %r8d + vpbroadcastq (%r9,%r8,8), %zmm1 + movq %r8, %rcx + leaq -40(%rdx,%rdi), %r8 + popcntq %rcx, %rcx + vpcmpq $6, %zmm2, %zmm8, %k3 + vpsrlvq %zmm0, %zmm1, %zmm1 + vpermq %zmm11, %zmm1, %zmm11 + vmovdqu64 %zmm11, (%rax,%rdi,8) + vmovdqu64 %zmm11, (%rax,%r8,8) + movq %rsi, %r8 + subq %rcx, %r8 + addq %rdi, %r8 + kmovb %k1, %edi + vpbroadcastq (%r9,%rdi,8), %zmm1 + movq %rdi, %rcx + leaq -48(%rdx,%r8), %rdi + popcntq %rcx, %rcx + vpsrlvq %zmm0, %zmm1, %zmm1 + vpermq %zmm10, %zmm1, %zmm10 + vmovdqu64 %zmm10, (%rax,%r8,8) + vmovdqu64 %zmm10, (%rax,%rdi,8) + movq %rsi, %rdi + subq %rcx, %rdi + addq %r8, %rdi + kmovb %k4, %r8d + vpbroadcastq (%r9,%r8,8), %zmm1 + movq %r8, %rcx + leaq -56(%rdx,%rdi), %r8 + popcntq %rcx, %rcx + vpsrlvq %zmm0, %zmm1, %zmm1 + vpermq %zmm9, %zmm1, %zmm9 
+ vmovdqu64 %zmm9, (%rax,%rdi,8) + vmovdqu64 %zmm9, (%rax,%r8,8) + movq %rsi, %r8 + subq %rcx, %r8 + xorl %ecx, %ecx + addq %rdi, %r8 + kmovb %k3, %edi + vpbroadcastq (%r9,%rdi,8), %zmm1 + popcntq %rdi, %rcx + leaq -64(%rdx,%r8), %rdx + subq %rcx, %rsi + vpsrlvq %zmm0, %zmm1, %zmm1 + vpermq %zmm8, %zmm1, %zmm8 + vmovdqu64 %zmm8, (%rax,%r8,8) + vmovdqu64 %zmm8, (%rax,%rdx,8) + leaq (%rsi,%r8), %rdx + movq -152(%rbp), %rsi + leaq (%rax,%rdx,8), %rcx + subq %rdx, %rsi +.L692: + movq %rcx, %rdi + cmpq $7, %rsi + ja .L697 + movq -152(%rbp), %rdi + leaq -64(%rax,%rdi,8), %rdi +.L697: + vpcmpq $6, %zmm2, %zmm16, %k3 + vmovdqu64 (%rdi), %zmm7 + vmovq %xmm17, %rsi + vmovdqu64 %zmm7, (%rsi) + movq $-1, %rsi + knotb %k3, %k4 + kmovb %k4, %edi + movq %rdi, %r10 + kmovb %k3, %r8d + vpbroadcastq (%r14,%r10,8), %zmm1 + popcntq %rdi, %rdi + addq %rdi, %rdx + bzhi %rdi, %rsi, %r11 + kmovb %r11d, %k3 + vpsrlvq %zmm0, %zmm1, %zmm1 + vpermq %zmm16, %zmm1, %zmm1 + vmovdqu64 %zmm1, (%rcx){%k3} + vpbroadcastq (%r14,%r8,8), %zmm1 + movzbl %r8b, %ecx + popcntq %rcx, %rcx + bzhi %rcx, %rsi, %rsi + kmovb %esi, %k4 + vpsrlvq %zmm0, %zmm1, %zmm0 + vpermq %zmm16, %zmm0, %zmm0 + vmovdqu64 %zmm0, (%rax,%rdx,8){%k4} + movq -120(%rbp), %r14 + movq -128(%rbp), %r9 + addq %rdx, %r14 + subq $1, %r9 + cmpl $2, -144(%rbp) + je .L795 + movq -136(%rbp), %rsi + movq %rbx, %r8 + movq %r12, %rcx + movq %r14, %rdx + movq %r13, %rdi + movq %r9, -120(%rbp) + vzeroupper + call _ZN3hwy11N_AVX3_ZEN46detail7RecurseINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + cmpl $3, -144(%rbp) + movq -120(%rbp), %r9 + je .L781 +.L699: + movq %r15, %rdx + movq -136(%rbp), %rsi + movq %rbx, %r8 + movq %r12, %rcx + subq %r14, %rdx + leaq 0(%r13,%r14,8), %rdi + call _ZN3hwy11N_AVX3_ZEN46detail7RecurseINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L781: + subq $-128, %rsp + popq %rbx + popq %r10 + .cfi_remember_state + .cfi_def_cfa 10, 0 + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + leaq -8(%r10), %rsp + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L688: + .cfi_restore_state + movq (%r12), %rcx + movl %r8d, %esi + leaq 8(%r11), %rdi + andq $-8, %rdi + movq %rcx, (%r11) + movq -8(%r12,%rsi), %rcx + movq %rcx, -8(%r11,%rsi) + movq %r11, %rcx + movq %r12, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %r8d, %ecx + shrl $3, %ecx + rep movsq + jmp .L689 + .p2align 4,,10 + .p2align 3 +.L684: + movq (%r11), %rcx + movl %r8d, %esi + leaq 8(%rax), %rdi + andq $-8, %rdi + movq %rcx, (%rax) + movq -8(%r11,%rsi), %rcx + movq %rcx, -8(%rax,%rsi) + movq %rax, %rcx + movq %r11, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %r8d, %ecx + shrl $3, %ecx + rep movsq + jmp .L685 + .p2align 4,,10 + .p2align 3 +.L790: + subq %rsi, -120(%rbp) + leaq 0(%r13,%rsi,8), %r10 + leaq (%r12,%r8), %rsi +.L702: + movq -120(%rbp), %r9 + movq $-1, %rdi + bzhi %r9, %rdi, %rdi + movzbl %dil, %edi + kmovd %edi, %k1 +.L683: + vmovdqu64 (%r10), %zmm1 + movq $-1, %rdi + vpcmpq $6, %zmm2, %zmm1, %k0 + kandnb %k1, %k0, %k4 + kmovb %k4, %r8d + movq %r8, %r9 + popcntq %r8, %r8 + vpbroadcastq (%r14,%r9,8), %zmm3 + bzhi %r8, %rdi, %rdi + kmovb %edi, %k5 + kandb %k1, %k0, %k1 + kmovb %k1, %edi + vpsrlvq %zmm0, %zmm3, %zmm3 + vpermq %zmm1, %zmm3, %zmm3 + vmovdqu64 %zmm3, (%rax){%k5} + leaq (%rax,%r8,8), %rax + xorl %r8d, %r8d + vpbroadcastq (%r14,%rdi,8), %zmm3 + movq %rax, %r9 + popcntq %rdi, %r8 + addq %rcx, %r8 + movq %rdx, %rcx + 
subq %r13, %r9 + vpsrlvq %zmm0, %zmm3, %zmm3 + subq %r8, %rcx + salq $3, %r8 + movq %r9, %rdi + leaq 0(%r13,%rcx,8), %r11 + sarq $3, %rdi + vpermq %zmm1, %zmm3, %zmm1 + subq %rdi, %rdx + vmovdqu64 %zmm1, (%rsi) + movq %rdx, -152(%rbp) + movq %rdx, %rsi + movq %rcx, %rdx + subq %rdi, %rdx + movq %rdi, -120(%rbp) + leaq (%rax,%rsi,8), %rdi + leaq (%rax,%rdx,8), %r10 + vmovq %rdi, %xmm17 + jmp .L682 + .p2align 4,,10 + .p2align 3 +.L792: + movzbl (%r12), %ecx + movb %cl, (%r11) + jmp .L689 + .p2align 4,,10 + .p2align 3 +.L791: + movzbl (%r11), %ecx + movb %cl, (%rax) + jmp .L685 + .p2align 4,,10 + .p2align 3 +.L678: + movq -120(%rbp), %r9 + leaq 0(,%rcx,8), %r8 + leaq -8(%r9), %rsi + leaq 1(%r9), %rdi + andq $-8, %rsi + addq $8, %rsi + cmpq $8, %rdi + movl $8, %edi + cmovbe %rdi, %rsi + cmpq %rsi, %r9 + je .L680 + subq %rsi, -120(%rbp) + movq -120(%rbp), %rdi + leaq 0(%r13,%rsi,8), %r10 + leaq (%r12,%r8), %rsi + cmpq $255, %rdi + jbe .L702 + movl $255, %edi + kmovd %edi, %k1 + jmp .L683 + .p2align 4,,10 + .p2align 3 +.L786: + cmpq $1, %rdx + jbe .L781 + leaq 1024(%rdi), %rax + cmpq %rax, %rsi + jb .L796 + movl $8, %esi + call _ZN3hwy11N_AVX3_ZEN46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + jmp .L781 + .p2align 4,,10 + .p2align 3 +.L627: + vmovdqu64 0(%r13), %zmm6 + movl $8, %esi + movq $-1, %rax + subq -144(%rbp), %rsi + bzhi %rsi, %rax, %rax + kmovb %eax, %k1 + vpcmpq $4, %zmm5, %zmm6, %k0 + kandb %k1, %k0, %k0 + kmovb %k0, %eax + kortestb %k0, %k0 + jne .L797 + vpxor %xmm4, %xmm4, %xmm4 + leaq 1024(%r13,%rsi,8), %rdi + vmovdqa64 %zmm4, %zmm3 + .p2align 4,,10 + .p2align 3 +.L633: + movq %rsi, %rcx + subq $-128, %rsi + cmpq %rsi, %r15 + jb .L637 + leaq -1024(%rdi), %rax +.L632: + vpxord (%rax), %zmm2, %zmm0 + vpxord 64(%rax), %zmm2, %zmm1 + leaq 128(%rax), %rdx + vpord %zmm3, %zmm0, %zmm3 + vpord %zmm4, %zmm1, %zmm4 + vpxord 128(%rax), %zmm2, %zmm0 + vpxord 192(%rax), %zmm2, %zmm1 + vpord %zmm3, %zmm0, %zmm3 + vpxord 256(%rax), %zmm2, %zmm0 + vpord %zmm4, %zmm1, %zmm4 + vpxord 320(%rax), %zmm2, %zmm1 + leaq 384(%rdx), %rax + vpord %zmm3, %zmm0, %zmm3 + vpxord 256(%rdx), %zmm2, %zmm0 + vpord %zmm4, %zmm1, %zmm4 + vpxord 320(%rdx), %zmm2, %zmm1 + vpord %zmm3, %zmm0, %zmm0 + vpord %zmm4, %zmm1, %zmm1 + vmovdqa64 %zmm0, %zmm3 + vmovdqa64 %zmm1, %zmm4 + cmpq %rdi, %rax + jne .L632 + vpord %zmm1, %zmm0, %zmm0 + leaq 1408(%rdx), %rdi + vptestnmq %zmm0, %zmm0, %k0 + kortestb %k0, %k0 + setc %al + testb %al, %al + jne .L633 + vmovdqa64 0(%r13,%rcx,8), %zmm6 + vpcmpq $4, %zmm5, %zmm6, %k0 + kortestb %k0, %k0 + jne .L635 + .p2align 4,,10 + .p2align 3 +.L634: + addq $8, %rcx + vmovdqa64 0(%r13,%rcx,8), %zmm7 + vpcmpq $4, %zmm5, %zmm7, %k0 + kortestb %k0, %k0 + je .L634 +.L635: + kmovb %k0, %eax + tzcntl %eax, %eax + addq %rcx, %rax +.L631: + vpbroadcastq 0(%r13,%rax,8), %zmm1 + leaq 0(%r13,%rax,8), %rdi + vpcmpq $6, %zmm5, %zmm1, %k0 + kortestb %k0, %k0 + jne .L640 + leaq -8(%r15), %rax + xorl %ecx, %ecx + jmp .L646 + .p2align 4,,10 + .p2align 3 +.L641: + kmovb %k0, %edx + popcntq %rdx, %rdx + addq %rdx, %rcx + leaq -8(%rax), %rdx + vmovdqu64 %zmm5, 0(%r13,%rax,8) + cmpq %rdx, %r15 + jbe .L798 + movq %rdx, %rax +.L646: + vmovdqu64 0(%r13,%rax,8), %zmm6 + vpcmpq $0, %zmm1, %zmm6, %k1 + vpcmpq $0, %zmm5, %zmm6, %k0 + kmovb %k1, %edx + kmovb %k0, %esi + korb %k0, %k1, %k1 + kortestb %k1, %k1 + jc .L641 + kmovb %edx, %k1 + kmovb %esi, %k4 + kxnorb %k4, %k1, %k1 + kmovb %k1, %edx + tzcntl %edx, %edx + leaq 8(%rax), %rsi + addq %rax, %rdx + addq 
$16, %rax + vpbroadcastq 0(%r13,%rdx,8), %zmm0 + movq %r15, %rdx + subq %rcx, %rdx + vmovdqa64 %zmm0, -112(%rbp) + cmpq %rdx, %rax + ja .L642 + .p2align 4,,10 + .p2align 3 +.L643: + vmovdqu64 %zmm1, -64(%r13,%rax,8) + movq %rax, %rsi + addq $8, %rax + cmpq %rax, %rdx + jnb .L643 +.L642: + subq %rsi, %rdx + leaq 0(%r13,%rsi,8), %rcx + movl $255, %eax + cmpq $255, %rdx + ja .L644 + movq $-1, %rax + bzhi %rdx, %rax, %rax + movzbl %al, %eax +.L644: + kmovb %eax, %k2 + vmovdqu64 %zmm1, (%rcx){%k2} +.L645: + vpbroadcastq (%r12), %zmm2 + vpcmpq $0, .LC8(%rip), %zmm2, %k0 + kortestb %k0, %k0 + jc .L725 + vpcmpq $0, .LC7(%rip), %zmm2, %k0 + kortestb %k0, %k0 + jc .L663 + vpminsq %zmm0, %zmm1, %zmm3 + vpcmpq $6, %zmm3, %zmm2, %k0 + kortestb %k0, %k0 + jne .L799 + vmovdqa64 %zmm2, %zmm0 + movl $128, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L658: + leaq (%rcx,%rax,8), %rdx + addq $1, %rax + vpminsq 0(%r13,%rdx,8), %zmm0, %zmm0 + cmpq $16, %rax + jne .L658 + vpcmpq $6, %zmm0, %zmm2, %k0 + kortestb %k0, %k0 + jne .L785 + leaq 128(%rsi), %rax + cmpq %rax, %r15 + jb .L665 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L658 + .p2align 4,,10 + .p2align 3 +.L637: + movq %rcx, %rdx + addq $8, %rcx + cmpq %rcx, %r15 + jb .L800 + vmovdqa64 -64(%r13,%rcx,8), %zmm6 + vpcmpq $4, %zmm5, %zmm6, %k0 + kmovb %k0, %eax + kortestb %k0, %k0 + je .L637 +.L783: + tzcntl %eax, %eax + addq %rdx, %rax + jmp .L631 + .p2align 4,,10 + .p2align 3 +.L677: + cmpq $0, -120(%rbp) + je .L801 + movq %r12, %rsi + movq %r13, %r10 + leaq _ZZN3hwy11N_AVX3_ZEN4L8CompressIlLPv0EEENS0_6Vec512IT_EES5_NS0_7Mask512IS4_EEE12packed_array(%rip), %r14 + movq %r13, %rax + vmovdqa64 .LC9(%rip), %zmm0 + jmp .L702 + .p2align 4,,10 + .p2align 3 +.L795: + vzeroupper + jmp .L699 + .p2align 4,,10 + .p2align 3 +.L739: + movq -152(%rbp), %rsi + movq %rax, %rcx + jmp .L692 + .p2align 4,,10 + .p2align 3 +.L740: + movq %rax, %r10 + movq %rdx, %r8 + movl $8, %ecx + leaq _ZZN3hwy11N_AVX3_ZEN4L11CompressNotIlLPv0EEENS0_6Vec512IT_EES5_NS0_7Mask512IS4_EEE12packed_array(%rip), %r9 + jmp .L693 +.L789: + leaq -1(%r15), %rbx + movq %rbx, %r12 + shrq %r12 + .p2align 4,,10 + .p2align 3 +.L675: + movq %r12, %rdx + movq %r15, %rsi + movq %r13, %rdi + call _ZN3hwy11N_AVX3_ZEN46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0 + subq $1, %r12 + jnb .L675 + .p2align 4,,10 + .p2align 3 +.L676: + movq 0(%r13,%rbx,8), %rdx + movq 0(%r13), %rax + movq %rbx, %rsi + movq %r13, %rdi + movq %rdx, 0(%r13) + xorl %edx, %edx + movq %rax, 0(%r13,%rbx,8) + call _ZN3hwy11N_AVX3_ZEN46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0 + subq $1, %rbx + jne .L676 + jmp .L781 + .p2align 4,,10 + .p2align 3 +.L666: + vpcmpq $6, -64(%r13,%rsi,8), %zmm2, %k0 + kortestb %k0, %k0 + jne .L785 +.L665: + movq %rsi, %rax + addq $8, %rsi + cmpq %rsi, %r15 + jnb .L666 + cmpq %rax, %r15 + je .L725 + vpcmpq $6, -64(%r13,%r15,8), %zmm2, %k0 + xorl %eax, %eax + kortestb %k0, %k0 + sete %al + addl $1, %eax + movl %eax, -144(%rbp) + jmp .L667 +.L727: + movl $12, %eax + movl $11, %esi + jmp .L673 + .p2align 4,,10 + .p2align 3 +.L674: + cmpq $23, %rax + je .L784 +.L673: + movq %rax, %rcx + addq $1, %rax + cmpq (%r12,%rax,8), %rdx + je .L674 + movl $12, %edi + subq $11, %rcx + movq %rdx, %rax + subq %rsi, %rdi + cmpq %rdi, %rcx + jb .L672 +.L784: + movq (%r12,%rsi,8), %rax + jmp .L672 +.L728: + movl $9, %esi + movl $10, %eax + jmp .L673 +.L729: + movl $8, %esi + movl $9, 
%eax + jmp .L673 +.L730: + movl $7, %esi + movl $8, %eax + jmp .L673 +.L731: + movl $6, %esi + movl $7, %eax + jmp .L673 +.L732: + movl $5, %esi + movl $6, %eax + jmp .L673 +.L640: + movq %r15, %rsi + leaq -112(%rbp), %rdx + vmovdqa64 %zmm5, %zmm0 + movq %r12, %rcx + subq %rax, %rsi + call _ZN3hwy11N_AVX3_ZEN46detail22MaybePartitionTwoValueINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, %al + jne .L781 + vmovdqa64 -112(%rbp), %zmm0 + jmp .L645 +.L733: + movl $4, %esi + movl $5, %eax + jmp .L673 +.L734: + movl $3, %esi + movl $4, %eax + jmp .L673 +.L735: + movl $2, %esi + movl $3, %eax + jmp .L673 +.L736: + movl $1, %esi + movl $2, %eax + jmp .L673 +.L788: + xorl %esi, %esi + movl $1, %eax + jmp .L673 +.L798: + movl $255, %edi + vmovdqu64 0(%r13), %zmm0 + kmovd %edi, %k2 + cmpq $255, %rax + ja .L647 + movq $-1, %rdx + bzhi %rax, %rdx, %rdx + movzbl %dl, %edi + kmovd %edi, %k2 +.L647: + vpcmpq $0, %zmm1, %zmm0, %k0 + vpcmpq $0, %zmm5, %zmm0, %k1 + movq %r15, %rsi + kandb %k2, %k1, %k1 + knotb %k2, %k2 + korb %k1, %k0, %k0 + korb %k2, %k0, %k0 + kortestb %k0, %k0 + setc %dl + subq %rcx, %rsi + testb %dl, %dl + je .L802 + movq %rsi, %rdx + kmovb %k1, %eax + popcntq %rax, %rax + vmovdqu64 %zmm5, 0(%r13) + subq %rax, %rdx + cmpq $7, %rdx + jbe .L652 + leaq -8(%rdx), %rcx + movq -152(%rbp), %rsi + movq %rcx, %rax + shrq $3, %rax + salq $6, %rax + leaq 64(%r13,%rax), %rax + .p2align 4,,10 + .p2align 3 +.L653: + vmovdqu64 %zmm1, (%rsi) + addq $64, %rsi + cmpq %rsi, %rax + jne .L653 + andq $-8, %rcx + vmovdqa64 %zmm1, (%r12) + leaq 8(%rcx), %rax + leaq 0(%r13,%rax,8), %r13 + subq %rax, %rdx + movl $255, %eax + kmovd %eax, %k1 + cmpq $255, %rdx + jbe .L704 +.L654: + vmovdqu64 (%r12), %zmm0{%k1}{z} + vmovdqu64 %zmm0, 0(%r13){%k1} + vzeroupper + jmp .L781 +.L670: + movl $11, %eax + movl $10, %esi + jmp .L673 +.L800: + leaq -8(%r15), %rdx + vmovdqu64 0(%r13,%rdx,8), %zmm7 + vpcmpq $4, %zmm5, %zmm7, %k0 + kmovb %k0, %eax + kortestb %k0, %k0 + jne .L783 + vzeroupper + jmp .L781 +.L662: + vmovdqu64 -64(%r13,%rsi,8), %zmm7 + vpcmpq $6, %zmm2, %zmm7, %k0 + kortestb %k0, %k0 + jne .L785 +.L661: + movq %rsi, %rax + addq $8, %rsi + cmpq %rsi, %r15 + jnb .L662 + cmpq %rax, %r15 + je .L663 + vmovdqu64 -64(%r13,%r15,8), %zmm7 + vpcmpq $6, %zmm2, %zmm7, %k0 + kortestb %k0, %k0 + jne .L785 +.L663: + movl $3, -144(%rbp) + vpternlogd $0xFF, %zmm0, %zmm0, %zmm0 + vpaddq %zmm0, %zmm2, %zmm2 + jmp .L667 +.L797: + tzcntl %eax, %eax + jmp .L631 +.L796: + movq %rdi, %rcx + movq %r12, %rdi + cmpq $7, %rdx + jbe .L618 + leaq -8(%rdx), %rdx + movq 0(%r13), %rcx + leaq 8(%r12), %rdi + movq %rdx, %rax + andq $-8, %rdi + shrq $3, %rax + movq %rcx, (%r12) + addq $1, %rax + salq $6, %rax + movl %eax, %ecx + movq -8(%r13,%rcx), %rsi + movq %rsi, -8(%r12,%rcx) + movq %r12, %rcx + movq %r13, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %eax, %ecx + movq %rdx, %rax + movq %r15, %rdx + andq $-8, %rax + shrl $3, %ecx + rep movsq + addq $8, %rax + leaq 0(,%rax,8), %rcx + subq %rax, %rdx + movl $255, %eax + leaq (%r12,%rcx), %rdi + addq %r13, %rcx + cmpq $255, %rdx + jbe .L618 +.L619: + leal -1(%r15), %edx + movl $32, %r8d + movl $1, %esi + kmovb %eax, %k7 + bsrl %edx, %edx + vmovdqu64 (%rcx), %zmm0{%k7}{z} + movq %r15, %rax + xorl $31, %edx + subl %edx, %r8d + movl $1, %edx + vmovdqu64 %zmm0, (%rdi){%k7} + vpbroadcastq .LC10(%rip), %zmm0 + shlx %r8, %rsi, %rsi + shrq $4, %rsi + cmove %rdx, %rsi + movq %rsi, %rdx + salq $4, %rdx + addq 
$8, %rdx + cmpq %rdx, %r15 + jnb .L623 + .p2align 4,,10 + .p2align 3 +.L620: + vmovdqu64 %zmm0, (%r12,%rax,8) + addq $8, %rax + cmpq %rdx, %rax + jb .L620 +.L623: + movq %r12, %rdi + vzeroupper + call _ZN3hwy11N_AVX3_ZEN46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + cmpq $7, %r15 + jbe .L622 + leaq -8(%r15), %rdx + movq (%r12), %rcx + leaq 8(%r13), %rdi + movq %rdx, %rax + andq $-8, %rdi + shrq $3, %rax + movq %rcx, 0(%r13) + addq $1, %rax + salq $6, %rax + movl %eax, %ecx + movq -8(%r12,%rcx), %rsi + movq %rsi, -8(%r13,%rcx) + movq %r13, %rcx + movq %r12, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %eax, %ecx + movq %rdx, %rax + andq $-8, %rax + shrl $3, %ecx + rep movsq + addq $8, %rax + leaq 0(,%rax,8), %rdx + subq %rax, %r15 + movl $255, %eax + addq %rdx, %r13 + addq %rdx, %r12 + cmpq $255, %r15 + jbe .L622 +.L624: + kmovb %eax, %k7 + vmovdqu64 (%r12), %zmm0{%k7}{z} + vmovdqu64 %zmm0, 0(%r13){%k7} + vzeroupper + jmp .L781 +.L652: + vmovdqa64 %zmm1, (%r12) +.L704: + movq $-1, %rax + bzhi %rdx, %rax, %rax + movzbl %al, %eax + kmovd %eax, %k1 + jmp .L654 +.L801: + movq %rdx, -152(%rbp) + vmovq %r10, %xmm17 + movq %r13, %rax + vmovdqa64 .LC9(%rip), %zmm0 + leaq _ZZN3hwy11N_AVX3_ZEN4L8CompressIlLPv0EEENS0_6Vec512IT_EES5_NS0_7Mask512IS4_EEE12packed_array(%rip), %r14 + jmp .L703 +.L725: + movl $2, -144(%rbp) + jmp .L667 +.L799: + vpmaxsq %zmm0, %zmm1, %zmm1 + vpcmpq $6, %zmm2, %zmm1, %k0 + kortestb %k0, %k0 + jne .L785 + vmovdqa64 %zmm2, %zmm0 + movl $128, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L659: + leaq (%rcx,%rax,8), %rdx + addq $1, %rax + vpmaxsq 0(%r13,%rdx,8), %zmm0, %zmm0 + cmpq $16, %rax + jne .L659 + vpcmpq $6, %zmm2, %zmm0, %k0 + kortestb %k0, %k0 + jne .L785 + leaq 128(%rsi), %rax + cmpq %rax, %r15 + jb .L661 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L659 +.L618: + movq $-1, %rax + bzhi %rdx, %rax, %rax + movzbl %al, %eax + jmp .L619 +.L622: + movq $-1, %rax + bzhi %r15, %rax, %rax + movzbl %al, %eax + jmp .L624 +.L802: + knotb %k0, %k1 + kmovb %k1, %edx + tzcntl %edx, %edx + vpbroadcastq 0(%r13,%rdx,8), %zmm0 + leaq 8(%rax), %rdx + vmovdqa64 %zmm0, -112(%rbp) + cmpq %rsi, %rdx + ja .L649 +.L650: + vmovdqu64 %zmm1, -64(%r13,%rdx,8) + movq %rdx, %rax + addq $8, %rdx + cmpq %rsi, %rdx + jbe .L650 +.L649: + movq %rsi, %rdx + leaq 0(%r13,%rax,8), %rcx + subq %rax, %rdx + movl $-1, %eax + cmpq $255, %rdx + ja .L651 + orq $-1, %rax + bzhi %rdx, %rax, %rax +.L651: + kmovb %eax, %k7 + vmovdqu64 %zmm1, (%rcx){%k7} + jmp .L645 + .cfi_endproc +.LFE18799: + .size _ZN3hwy11N_AVX3_ZEN46detail7RecurseINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0, .-_ZN3hwy11N_AVX3_ZEN46detail7RecurseINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + .section .text._ZN3hwy6N_AVX36detail7RecurseINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_AVX36detail7RecurseINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0, @function +_ZN3hwy6N_AVX36detail7RecurseINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0: +.LFB18801: + .cfi_startproc + leaq 8(%rsp), %r10 + .cfi_def_cfa 10, 0 + andq $-64, %rsp + pushq -8(%r10) + pushq 
%rbp + movq %rsp, %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + pushq %r15 + .cfi_escape 0x10,0xf,0x2,0x76,0x78 + movq %rdx, %r15 + pushq %r14 + pushq %r13 + .cfi_escape 0x10,0xe,0x2,0x76,0x70 + .cfi_escape 0x10,0xd,0x2,0x76,0x68 + movq %rdi, %r13 + pushq %r12 + .cfi_escape 0x10,0xc,0x2,0x76,0x60 + movq %rcx, %r12 + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x58,0x6 + pushq %rbx + addq $-128, %rsp + .cfi_escape 0x10,0x3,0x2,0x76,0x50 + movq %rsi, -128(%rbp) + movq %r9, -120(%rbp) + cmpq $128, %rdx + jbe .L977 + movq %rdi, %r11 + movq %rdi, -152(%rbp) + movq %r8, %rbx + shrq $3, %r11 + movq %r11, %rdi + andl $7, %edi + movq %rdi, -144(%rbp) + jne .L978 + movq %rdx, -136(%rbp) + movq %r13, %r11 +.L816: + movq 8(%rbx), %rdx + movq 16(%rbx), %r9 + movq %rdx, %rsi + leaq 1(%r9), %rdi + leaq (%rdx,%rdx,8), %rcx + xorq (%rbx), %rdi + shrq $11, %rsi + rorx $40, %rdx, %rax + leaq 2(%r9), %rdx + addq %rdi, %rax + xorq %rsi, %rcx + movq %rax, %r8 + rorx $40, %rax, %rsi + xorq %rdx, %rcx + shrq $11, %r8 + leaq (%rax,%rax,8), %rdx + leaq 3(%r9), %rax + addq %rcx, %rsi + xorq %r8, %rdx + movq %rsi, %r8 + xorq %rax, %rdx + leaq (%rsi,%rsi,8), %rax + rorx $40, %rsi, %r10 + shrq $11, %r8 + addq %rdx, %r10 + leaq 4(%r9), %rsi + addq $5, %r9 + xorq %r8, %rax + rorx $40, %r10, %r8 + movq %r9, 16(%rbx) + xorq %rsi, %rax + movq %r10, %rsi + shrq $11, %rsi + addq %rax, %r8 + movq %rsi, %r14 + leaq (%r10,%r10,8), %rsi + leaq (%r8,%r8,8), %r10 + xorq %r14, %rsi + movq %r8, %r14 + rorx $40, %r8, %r8 + shrq $11, %r14 + xorq %r9, %rsi + movabsq $34359738359, %r9 + xorq %r14, %r10 + addq %rsi, %r8 + movl %esi, %esi + vmovq %r10, %xmm6 + movq -136(%rbp), %r10 + vpinsrq $1, %r8, %xmm6, %xmm0 + movq %r10, %r14 + vmovdqu %xmm0, (%rbx) + shrq $3, %r14 + cmpq %r9, %r10 + movl $4294967295, %r9d + movq %r14, %r8 + leaq 192(%r12), %r14 + cmova %r9, %r8 + movl %edi, %r9d + shrq $32, %rdi + imulq %r8, %r9 + imulq %r8, %rdi + imulq %r8, %rsi + shrq $32, %r9 + salq $6, %r9 + shrq $32, %rdi + vmovdqa64 (%r11,%r9), %zmm2 + movl %ecx, %r9d + shrq $32, %rcx + imulq %r8, %r9 + salq $6, %rdi + shrq $32, %rsi + imulq %r8, %rcx + salq $6, %rsi + vmovdqa64 (%r11,%rsi), %zmm5 + shrq $32, %r9 + salq $6, %r9 + shrq $32, %rcx + vmovdqa64 (%r11,%r9), %zmm3 + salq $6, %rcx + vpminsq %zmm3, %zmm2, %zmm0 + vpmaxsq (%r11,%rdi), %zmm0, %zmm0 + vpmaxsq %zmm3, %zmm2, %zmm2 + vpminsq %zmm2, %zmm0, %zmm0 + vmovdqa64 (%r11,%rcx), %zmm2 + movq %rdx, %rcx + movl %edx, %edx + shrq $32, %rcx + imulq %r8, %rdx + vmovdqa64 %zmm0, (%r12) + imulq %r8, %rcx + shrq $32, %rdx + shrq $32, %rcx + salq $6, %rdx + salq $6, %rcx + vmovdqa64 (%r11,%rcx), %zmm4 + vpminsq %zmm4, %zmm2, %zmm3 + vpmaxsq (%r11,%rdx), %zmm3, %zmm3 + movl %eax, %edx + shrq $32, %rax + imulq %r8, %rdx + vpmaxsq %zmm4, %zmm2, %zmm2 + imulq %r8, %rax + vpminsq %zmm2, %zmm3, %zmm3 + vmovdqa64 %zmm3, 64(%r12) + shrq $32, %rdx + salq $6, %rdx + shrq $32, %rax + vmovdqa64 (%r11,%rdx), %zmm4 + salq $6, %rax + vpminsq %zmm5, %zmm4, %zmm2 + vpmaxsq (%r11,%rax), %zmm2, %zmm1 + vpmaxsq %zmm5, %zmm4, %zmm4 + vpbroadcastq %xmm0, %zmm5 + vpminsq %zmm4, %zmm1, %zmm1 + vpxord %zmm0, %zmm5, %zmm0 + vpxord %zmm3, %zmm5, %zmm3 + vmovdqa64 %zmm1, 128(%r12) + vpord %zmm3, %zmm0, %zmm0 + vpxord %zmm1, %zmm5, %zmm1 + vmovdqa32 %zmm5, %zmm2 + vpord %zmm1, %zmm0, %zmm0 + vptestnmq %zmm0, %zmm0, %k0 + kortestb %k0, %k0 + jc .L818 + vpbroadcastq .LC10(%rip), %zmm0 + movl $2, %esi + movq %r12, %rdi + vmovdqu64 %zmm0, 192(%r12) + vmovdqu64 %zmm0, 256(%r12) + vzeroupper + call 
_ZN3hwy6N_AVX36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + vpbroadcastq (%r12), %zmm2 + vpbroadcastq 184(%r12), %zmm1 + vpternlogd $0xFF, %zmm0, %zmm0, %zmm0 + vpaddq %zmm0, %zmm1, %zmm0 + vpcmpq $0, %zmm0, %zmm2, %k0 + kortestb %k0, %k0 + jnc .L820 + leaq -112(%rbp), %rdx + movq %r14, %rcx + vmovdqa64 %zmm2, %zmm0 + movq %r15, %rsi + movq %r13, %rdi + call _ZN3hwy6N_AVX36detail22MaybePartitionTwoValueINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, %al + jne .L972 +.L820: + movq 96(%r12), %rdx + cmpq %rdx, 88(%r12) + jne .L918 + cmpq 80(%r12), %rdx + jne .L861 + cmpq 72(%r12), %rdx + jne .L919 + cmpq 64(%r12), %rdx + jne .L920 + cmpq 56(%r12), %rdx + jne .L921 + cmpq 48(%r12), %rdx + jne .L922 + cmpq 40(%r12), %rdx + jne .L923 + cmpq 32(%r12), %rdx + jne .L924 + cmpq 24(%r12), %rdx + jne .L925 + cmpq 16(%r12), %rdx + jne .L926 + cmpq 8(%r12), %rdx + jne .L927 + movq (%r12), %rax + cmpq %rax, %rdx + jne .L979 +.L863: + vpbroadcastq %rax, %zmm0 +.L976: + movl $1, -136(%rbp) +.L858: + cmpq $0, -120(%rbp) + je .L980 + leaq -8(%r15), %rdx + leaq 0(%r13,%rdx,8), %r10 + movq %rdx, %r9 + movq %rdx, %rcx + vmovdqu64 (%r10), %zmm6 + andl $31, %r9d + andl $24, %ecx + je .L868 + vmovdqu64 0(%r13), %zmm1 + vpcmpq $6, %zmm0, %zmm1, %k2 + knotb %k2, %k1 + vpcompressq %zmm1, 0(%r13){%k1} + kmovb %k1, %eax + kmovb %k2, %ecx + popcntq %rax, %rax + popcntq %rcx, %rcx + leaq 0(%r13,%rax,8), %rax + vpcompressq %zmm1, (%r12){%k2} + testb $16, %dl + je .L869 + vmovdqu64 64(%r13), %zmm1 + vpcmpq $6, %zmm0, %zmm1, %k1 + knotb %k1, %k2 + vpcompressq %zmm1, (%rax){%k2} + kmovb %k2, %esi + popcntq %rsi, %rsi + vpcompressq %zmm1, (%r12,%rcx,8){%k1} + leaq (%rax,%rsi,8), %rax + kmovb %k1, %esi + popcntq %rsi, %rsi + addq %rsi, %rcx + cmpq $23, %r9 + jbe .L869 + vmovdqu64 128(%r13), %zmm1 + vpcmpq $6, %zmm0, %zmm1, %k1 + knotb %k1, %k2 + vpcompressq %zmm1, (%rax){%k2} + kmovb %k2, %esi + popcntq %rsi, %rsi + vpcompressq %zmm1, (%r12,%rcx,8){%k1} + leaq (%rax,%rsi,8), %rax + kmovb %k1, %esi + popcntq %rsi, %rsi + addq %rsi, %rcx + leaq 1(%r9), %rsi + cmpq $9, %rsi + leaq 0(,%rcx,8), %r8 + sbbq %rsi, %rsi + andq $-16, %rsi + addq $24, %rsi + cmpq %rsi, %r9 + jne .L981 +.L871: + movq %rax, %r9 + movq %rdx, %rsi + movq %rdx, %rdi + subq %r13, %r9 + subq %rcx, %rsi + sarq $3, %r9 + leaq 0(%r13,%rsi,8), %r11 + subq %r9, %rdi + subq %r9, %rsi + movq %rdi, -144(%rbp) + leaq (%rax,%rdi,8), %rdi + movq %rsi, %rdx + leaq (%rax,%rsi,8), %r10 + vmovq %rdi, %xmm17 + .p2align 4,,10 + .p2align 3 +.L873: + cmpl $8, %r8d + jnb .L875 + testl %r8d, %r8d + jne .L982 +.L876: + cmpl $8, %r8d + jnb .L879 + testl %r8d, %r8d + jne .L983 +.L880: + testq %rdx, %rdx + je .L930 +.L894: + leaq 256(%rax), %rsi + leaq -256(%r10), %rdi + vmovdqu64 (%rax), %zmm15 + vmovdqu64 64(%rax), %zmm14 + vmovdqu64 128(%rax), %zmm13 + vmovdqu64 192(%rax), %zmm12 + vmovdqu64 -128(%r10), %zmm9 + vmovdqu64 -64(%r10), %zmm8 + vmovdqu64 -256(%r10), %zmm11 + vmovdqu64 -192(%r10), %zmm10 + cmpq %rdi, %rsi + je .L931 + xorl %ecx, %ecx + leaq _ZZN3hwy6N_AVX3L11CompressNotIlLPv0EEENS0_6Vec512IT_EES5_NS0_7Mask512IS4_EEE12packed_array(%rip), %r14 + vmovdqa64 .LC9(%rip), %zmm5 + movl $8, %r8d + jmp .L887 + .p2align 4,,10 + .p2align 3 +.L985: + vmovdqu64 -128(%rdi), %zmm2 + vmovdqu64 -64(%rdi), %zmm1 + prefetcht0 -1024(%rdi) + subq $256, %rdi + vmovdqu64 (%rdi), %zmm4 + vmovdqu64 64(%rdi), %zmm3 +.L886: + vpcmpq $6, 
%zmm0, %zmm4, %k5 + vpcmpq $6, %zmm0, %zmm3, %k6 + vpcmpq $6, %zmm0, %zmm2, %k7 + vpcmpq $6, %zmm0, %zmm1, %k4 + kmovb %k5, %r11d + vpbroadcastq (%r14,%r11,8), %zmm7 + movq %r11, %r10 + leaq -8(%rdx,%rcx), %r11 + popcntq %r10, %r10 + vpsrlvq %zmm5, %zmm7, %zmm7 + vpermq %zmm4, %zmm7, %zmm4 + vmovdqu64 %zmm4, (%rax,%rcx,8) + addq $8, %rcx + vmovdqu64 %zmm4, (%rax,%r11,8) + kmovb %k6, %r11d + subq %r10, %rcx + vpbroadcastq (%r14,%r11,8), %zmm4 + movq %r11, %r10 + leaq -16(%rdx,%rcx), %r11 + vpsrlvq %zmm5, %zmm4, %zmm4 + popcntq %r10, %r10 + vpermq %zmm3, %zmm4, %zmm3 + vmovdqu64 %zmm3, (%rax,%rcx,8) + vmovdqu64 %zmm3, (%rax,%r11,8) + movq %r8, %r11 + subq %r10, %r11 + leaq (%r11,%rcx), %r10 + kmovb %k7, %r11d + vpbroadcastq (%r14,%r11,8), %zmm3 + movq %r11, %rcx + leaq -24(%rdx,%r10), %r11 + popcntq %rcx, %rcx + subq $32, %rdx + vpsrlvq %zmm5, %zmm3, %zmm3 + vpermq %zmm2, %zmm3, %zmm2 + vmovdqu64 %zmm2, (%rax,%r10,8) + vmovdqu64 %zmm2, (%rax,%r11,8) + movq %r8, %r11 + subq %rcx, %r11 + kmovb %k4, %ecx + vpbroadcastq (%r14,%rcx,8), %zmm2 + addq %r10, %r11 + movq %rcx, %r10 + vpsrlvq %zmm5, %zmm2, %zmm2 + leaq (%r11,%rdx), %rcx + popcntq %r10, %r10 + vpermq %zmm1, %zmm2, %zmm1 + vmovdqu64 %zmm1, (%rax,%r11,8) + vmovdqu64 %zmm1, (%rax,%rcx,8) + movq %r8, %rcx + subq %r10, %rcx + addq %r11, %rcx + cmpq %rdi, %rsi + je .L984 +.L887: + movq %rsi, %r10 + subq %rax, %r10 + sarq $3, %r10 + subq %rcx, %r10 + cmpq $32, %r10 + ja .L985 + vmovdqu64 (%rsi), %zmm4 + vmovdqu64 64(%rsi), %zmm3 + prefetcht0 1024(%rsi) + addq $256, %rsi + vmovdqu64 -128(%rsi), %zmm2 + vmovdqu64 -64(%rsi), %zmm1 + jmp .L886 + .p2align 4,,10 + .p2align 3 +.L978: + movl $8, %eax + subq %rdi, %rax + leaq 0(%r13,%rax,8), %r11 + leaq -8(%rdi,%rdx), %rax + movq %rax, -136(%rbp) + jmp .L816 + .p2align 4,,10 + .p2align 3 +.L984: + leaq (%rdx,%rcx), %r8 + leaq (%rax,%rcx,8), %r10 + addq $8, %rcx +.L884: + vpcmpq $6, %zmm0, %zmm15, %k3 + xorl %esi, %esi + kmovb %k3, %edi + vpbroadcastq (%r14,%rdi,8), %zmm1 + vpcmpq $6, %zmm0, %zmm14, %k3 + popcntq %rdi, %rsi + subq %rsi, %rcx + vpsrlvq %zmm5, %zmm1, %zmm1 + kmovb %k3, %edi + vpcmpq $6, %zmm0, %zmm13, %k3 + movq %rdi, %rsi + vpermq %zmm15, %zmm1, %zmm15 + vpbroadcastq (%r14,%rdi,8), %zmm1 + leaq -16(%rdx,%rcx), %rdi + popcntq %rsi, %rsi + vmovdqu64 %zmm15, (%r10) + vpsrlvq %zmm5, %zmm1, %zmm1 + vmovdqu64 %zmm15, -64(%rax,%r8,8) + vpermq %zmm14, %zmm1, %zmm14 + vmovdqu64 %zmm14, (%rax,%rcx,8) + subq %rsi, %rcx + vmovdqu64 %zmm14, (%rax,%rdi,8) + kmovb %k3, %edi + addq $8, %rcx + vpbroadcastq (%r14,%rdi,8), %zmm1 + movq %rdi, %rsi + vpcmpq $6, %zmm0, %zmm12, %k3 + leaq -24(%rdx,%rcx), %rdi + vpsrlvq %zmm5, %zmm1, %zmm1 + vpermq %zmm13, %zmm1, %zmm13 + vmovdqu64 %zmm13, (%rax,%rcx,8) + vmovdqu64 %zmm13, (%rax,%rdi,8) + xorl %edi, %edi + popcntq %rsi, %rdi + movl $8, %esi + movq %rsi, %r8 + subq %rdi, %r8 + kmovb %k3, %edi + vpbroadcastq (%r14,%rdi,8), %zmm1 + vpcmpq $6, %zmm0, %zmm11, %k3 + addq %rcx, %r8 + movq %rdi, %rcx + vpsrlvq %zmm5, %zmm1, %zmm1 + leaq -32(%rdx,%r8), %rdi + popcntq %rcx, %rcx + vpermq %zmm12, %zmm1, %zmm12 + vmovdqu64 %zmm12, (%rax,%r8,8) + vmovdqu64 %zmm12, (%rax,%rdi,8) + movq %rsi, %rdi + subq %rcx, %rdi + addq %r8, %rdi + kmovb %k3, %r8d + vpbroadcastq (%r14,%r8,8), %zmm1 + movq %r8, %rcx + vpcmpq $6, %zmm0, %zmm10, %k3 + leaq -40(%rdx,%rdi), %r8 + popcntq %rcx, %rcx + vpsrlvq %zmm5, %zmm1, %zmm1 + vpermq %zmm11, %zmm1, %zmm11 + vmovdqu64 %zmm11, (%rax,%rdi,8) + vmovdqu64 %zmm11, (%rax,%r8,8) + movq %rsi, %r8 + subq %rcx, %r8 + addq %rdi, %r8 + kmovb %k3, %edi + 
vpbroadcastq (%r14,%rdi,8), %zmm1 + movq %rdi, %rcx + vpcmpq $6, %zmm0, %zmm9, %k3 + leaq -48(%rdx,%r8), %rdi + popcntq %rcx, %rcx + vpsrlvq %zmm5, %zmm1, %zmm1 + vpermq %zmm10, %zmm1, %zmm10 + vmovdqu64 %zmm10, (%rax,%r8,8) + vmovdqu64 %zmm10, (%rax,%rdi,8) + movq %rsi, %rdi + subq %rcx, %rdi + addq %r8, %rdi + kmovb %k3, %r8d + vpbroadcastq (%r14,%r8,8), %zmm1 + movq %r8, %rcx + vpcmpq $6, %zmm0, %zmm8, %k3 + leaq -56(%rdx,%rdi), %r8 + popcntq %rcx, %rcx + vpsrlvq %zmm5, %zmm1, %zmm1 + vpermq %zmm9, %zmm1, %zmm9 + vmovdqu64 %zmm9, (%rax,%rdi,8) + vmovdqu64 %zmm9, (%rax,%r8,8) + movq %rsi, %r8 + subq %rcx, %r8 + xorl %ecx, %ecx + addq %rdi, %r8 + kmovb %k3, %edi + vpbroadcastq (%r14,%rdi,8), %zmm1 + popcntq %rdi, %rcx + leaq -64(%rdx,%r8), %rdx + subq %rcx, %rsi + vpsrlvq %zmm5, %zmm1, %zmm1 + vpermq %zmm8, %zmm1, %zmm8 + vmovdqu64 %zmm8, (%rax,%r8,8) + vmovdqu64 %zmm8, (%rax,%rdx,8) + leaq (%rsi,%r8), %rdx + movq -144(%rbp), %rsi + leaq (%rax,%rdx,8), %rcx + subq %rdx, %rsi +.L883: + movq %rcx, %rdi + cmpq $7, %rsi + ja .L888 + movq -144(%rbp), %rdi + leaq -64(%rax,%rdi,8), %rdi +.L888: + vmovdqu64 (%rdi), %zmm7 + vpcmpq $6, %zmm0, %zmm6, %k2 + vmovq %xmm17, %rsi + vmovdqu64 %zmm7, (%rsi) + knotb %k2, %k1 + vpcompressq %zmm6, (%rcx){%k1} + kmovb %k1, %ecx + popcntq %rcx, %rcx + addq %rcx, %rdx + vpcompressq %zmm6, (%rax,%rdx,8){%k2} + leaq (%r9,%rdx), %r14 + movq -120(%rbp), %r9 + subq $1, %r9 + cmpl $2, -136(%rbp) + je .L986 + movq -128(%rbp), %rsi + movq %rbx, %r8 + movq %r12, %rcx + movq %r14, %rdx + movq %r13, %rdi + movq %r9, -120(%rbp) + vzeroupper + call _ZN3hwy6N_AVX36detail7RecurseINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + cmpl $3, -136(%rbp) + movq -120(%rbp), %r9 + je .L972 +.L890: + movq %r15, %rdx + movq -128(%rbp), %rsi + leaq 0(%r13,%r14,8), %rdi + movq %rbx, %r8 + subq %r14, %rdx + movq %r12, %rcx + call _ZN3hwy6N_AVX36detail7RecurseINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L972: + subq $-128, %rsp + popq %rbx + popq %r10 + .cfi_remember_state + .cfi_def_cfa 10, 0 + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + leaq -8(%r10), %rsp + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L879: + .cfi_restore_state + movq (%r12), %rcx + leaq 8(%r11), %rdi + andq $-8, %rdi + movq %rcx, (%r11) + movl %r8d, %ecx + movq -8(%r12,%rcx), %rsi + movq %rsi, -8(%r11,%rcx) + movq %r11, %rcx + movq %r12, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %r8d, %ecx + shrl $3, %ecx + rep movsq + jmp .L880 + .p2align 4,,10 + .p2align 3 +.L875: + movq (%r11), %rcx + leaq 8(%rax), %rdi + andq $-8, %rdi + movq %rcx, (%rax) + movl %r8d, %ecx + movq -8(%r11,%rcx), %rsi + movq %rsi, -8(%rax,%rcx) + movq %rax, %rcx + movq %r11, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %r8d, %ecx + shrl $3, %ecx + rep movsq + jmp .L876 + .p2align 4,,10 + .p2align 3 +.L981: + subq %rsi, %r9 + leaq 0(%r13,%rsi,8), %r11 + leaq (%r12,%r8), %rsi +.L893: + movq $-1, %rdi + bzhi %r9, %rdi, %rdi + movzbl %dil, %edi + kmovd %edi, %k0 +.L874: + vmovdqu64 (%r11), %zmm1 + vpcmpq $6, %zmm0, %zmm1, %k1 + kandnb %k0, %k1, %k2 + vpcompressq %zmm1, (%rax){%k2} + kmovb %k2, %edi + kandb %k0, %k1, %k1 + popcntq %rdi, %rdi + leaq (%rax,%rdi,8), %rax + kmovb %k1, %r8d + popcntq %r8, %r8 + movq %rax, %r9 + addq %rcx, %r8 + movq %rdx, %rcx + movq %rdx, %rdi + subq %r13, %r9 + subq %r8, %rcx + vpcompressq %zmm1, (%rsi){%k1} + salq $3, %r8 + sarq $3, %r9 + movq %rcx, 
%rdx + leaq 0(%r13,%rcx,8), %r11 + subq %r9, %rdi + subq %r9, %rdx + movq %rdi, -144(%rbp) + leaq (%rax,%rdi,8), %rdi + leaq (%rax,%rdx,8), %r10 + vmovq %rdi, %xmm17 + jmp .L873 + .p2align 4,,10 + .p2align 3 +.L983: + movzbl (%r12), %ecx + movb %cl, (%r11) + jmp .L880 + .p2align 4,,10 + .p2align 3 +.L982: + movzbl (%r11), %ecx + movb %cl, (%rax) + jmp .L876 + .p2align 4,,10 + .p2align 3 +.L869: + leaq -8(%r9), %rsi + leaq 1(%r9), %rdi + andq $-8, %rsi + leaq 0(,%rcx,8), %r8 + addq $8, %rsi + cmpq $8, %rdi + movl $8, %edi + cmovbe %rdi, %rsi + cmpq %rsi, %r9 + je .L871 + subq %rsi, %r9 + leaq 0(%r13,%rsi,8), %r11 + leaq (%r12,%r8), %rsi + cmpq $255, %r9 + jbe .L893 + movl $255, %edi + kmovd %edi, %k0 + jmp .L874 + .p2align 4,,10 + .p2align 3 +.L977: + cmpq $1, %rdx + jbe .L972 + leaq 1024(%rdi), %rax + cmpq %rax, %rsi + jb .L987 + movl $8, %esi + call _ZN3hwy6N_AVX36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + jmp .L972 + .p2align 4,,10 + .p2align 3 +.L818: + vmovdqu64 0(%r13), %zmm6 + movl $8, %esi + movq $-1, %rax + subq -144(%rbp), %rsi + bzhi %rsi, %rax, %rax + kmovb %eax, %k2 + vpcmpq $4, %zmm5, %zmm6, %k0 + kandb %k2, %k0, %k0 + kmovb %k0, %eax + kortestb %k0, %k0 + jne .L988 + vpxor %xmm4, %xmm4, %xmm4 + leaq 1024(%r13,%rsi,8), %rdi + vmovdqa64 %zmm4, %zmm3 + .p2align 4,,10 + .p2align 3 +.L824: + movq %rsi, %rcx + subq $-128, %rsi + cmpq %rsi, %r15 + jb .L828 + leaq -1024(%rdi), %rax +.L823: + vpxord (%rax), %zmm2, %zmm0 + vpxord 64(%rax), %zmm2, %zmm1 + leaq 128(%rax), %rdx + vpord %zmm3, %zmm0, %zmm3 + vpord %zmm4, %zmm1, %zmm4 + vpxord 128(%rax), %zmm2, %zmm0 + vpxord 192(%rax), %zmm2, %zmm1 + vpord %zmm3, %zmm0, %zmm3 + vpxord 256(%rax), %zmm2, %zmm0 + vpord %zmm4, %zmm1, %zmm4 + vpxord 320(%rax), %zmm2, %zmm1 + leaq 384(%rdx), %rax + vpord %zmm3, %zmm0, %zmm3 + vpxord 256(%rdx), %zmm2, %zmm0 + vpord %zmm4, %zmm1, %zmm4 + vpxord 320(%rdx), %zmm2, %zmm1 + vpord %zmm3, %zmm0, %zmm0 + vpord %zmm4, %zmm1, %zmm1 + vmovdqa64 %zmm0, %zmm3 + vmovdqa64 %zmm1, %zmm4 + cmpq %rax, %rdi + jne .L823 + vpord %zmm1, %zmm0, %zmm0 + leaq 1408(%rdx), %rdi + vptestnmq %zmm0, %zmm0, %k0 + kortestb %k0, %k0 + setc %al + testb %al, %al + jne .L824 + vmovdqa64 0(%r13,%rcx,8), %zmm6 + vpcmpq $4, %zmm5, %zmm6, %k0 + kortestb %k0, %k0 + jne .L826 + .p2align 4,,10 + .p2align 3 +.L825: + addq $8, %rcx + vmovdqa64 0(%r13,%rcx,8), %zmm6 + vpcmpq $4, %zmm5, %zmm6, %k0 + kortestb %k0, %k0 + je .L825 +.L826: + kmovb %k0, %eax + tzcntl %eax, %eax + addq %rcx, %rax +.L822: + vpbroadcastq 0(%r13,%rax,8), %zmm1 + leaq 0(%r13,%rax,8), %rdi + vpcmpq $6, %zmm5, %zmm1, %k0 + kortestb %k0, %k0 + jne .L831 + leaq -8(%r15), %rax + xorl %ecx, %ecx + jmp .L837 + .p2align 4,,10 + .p2align 3 +.L832: + kmovb %k0, %edx + popcntq %rdx, %rdx + addq %rdx, %rcx + leaq -8(%rax), %rdx + vmovdqu64 %zmm5, 0(%r13,%rax,8) + cmpq %rdx, %r15 + jbe .L989 + movq %rdx, %rax +.L837: + vmovdqu64 0(%r13,%rax,8), %zmm6 + vpcmpq $0, %zmm1, %zmm6, %k1 + vpcmpq $0, %zmm5, %zmm6, %k0 + kmovb %k1, %edx + kmovb %k0, %esi + korb %k0, %k1, %k1 + kortestb %k1, %k1 + jc .L832 + kmovb %edx, %k2 + kmovb %esi, %k3 + kxnorb %k3, %k2, %k2 + kmovb %k2, %edx + tzcntl %edx, %edx + leaq 8(%rax), %rsi + addq %rax, %rdx + addq $16, %rax + vpbroadcastq 0(%r13,%rdx,8), %zmm2 + movq %r15, %rdx + subq %rcx, %rdx + vmovdqa64 %zmm2, -112(%rbp) + cmpq %rdx, %rax + ja .L833 + .p2align 4,,10 + .p2align 3 +.L834: + vmovdqu64 %zmm1, -64(%r13,%rax,8) + movq %rax, %rsi + addq $8, %rax + cmpq %rax, %rdx + jnb .L834 
+.L833: + subq %rsi, %rdx + leaq 0(%r13,%rsi,8), %rcx + movl $255, %eax + cmpq $255, %rdx + ja .L835 + movq $-1, %rax + bzhi %rdx, %rax, %rax + movzbl %al, %eax +.L835: + kmovb %eax, %k3 + vmovdqu64 %zmm1, (%rcx){%k3} +.L836: + vpbroadcastq (%r12), %zmm0 + vpcmpq $0, .LC8(%rip), %zmm0, %k0 + kortestb %k0, %k0 + jc .L916 + vpcmpq $0, .LC7(%rip), %zmm0, %k0 + kortestb %k0, %k0 + jc .L854 + vpminsq %zmm2, %zmm1, %zmm3 + vpcmpq $6, %zmm3, %zmm0, %k0 + kortestb %k0, %k0 + jne .L990 + vmovdqa64 %zmm0, %zmm1 + movl $128, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L849: + leaq (%rcx,%rax,8), %rdx + addq $1, %rax + vpminsq 0(%r13,%rdx,8), %zmm1, %zmm1 + cmpq $16, %rax + jne .L849 + vpcmpq $6, %zmm1, %zmm0, %k0 + kortestb %k0, %k0 + jne .L976 + leaq 128(%rsi), %rax + cmpq %rax, %r15 + jb .L856 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L849 + .p2align 4,,10 + .p2align 3 +.L828: + movq %rcx, %rdx + addq $8, %rcx + cmpq %rcx, %r15 + jb .L991 + vmovdqa64 -64(%r13,%rcx,8), %zmm6 + vpcmpq $4, %zmm5, %zmm6, %k0 + kmovb %k0, %eax + kortestb %k0, %k0 + je .L828 +.L974: + tzcntl %eax, %eax + addq %rdx, %rax + jmp .L822 + .p2align 4,,10 + .p2align 3 +.L868: + movq %r12, %rsi + movq %r13, %r11 + movq %r13, %rax + testq %r9, %r9 + jne .L893 + movq %rdx, -144(%rbp) + vmovq %r10, %xmm17 + jmp .L894 + .p2align 4,,10 + .p2align 3 +.L986: + vzeroupper + jmp .L890 + .p2align 4,,10 + .p2align 3 +.L930: + movq -144(%rbp), %rsi + movq %rax, %rcx + jmp .L883 + .p2align 4,,10 + .p2align 3 +.L931: + vmovdqa64 .LC9(%rip), %zmm5 + movq %rax, %r10 + movq %rdx, %r8 + movl $8, %ecx + leaq _ZZN3hwy6N_AVX3L11CompressNotIlLPv0EEENS0_6Vec512IT_EES5_NS0_7Mask512IS4_EEE12packed_array(%rip), %r14 + jmp .L884 +.L980: + leaq -1(%r15), %rbx + movq %rbx, %r12 + shrq %r12 + .p2align 4,,10 + .p2align 3 +.L866: + movq %r12, %rdx + movq %r15, %rsi + movq %r13, %rdi + call _ZN3hwy6N_AVX36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0 + subq $1, %r12 + jnb .L866 + .p2align 4,,10 + .p2align 3 +.L867: + movq 0(%r13,%rbx,8), %rdx + movq 0(%r13), %rax + movq %rbx, %rsi + movq %r13, %rdi + movq %rdx, 0(%r13) + xorl %edx, %edx + movq %rax, 0(%r13,%rbx,8) + call _ZN3hwy6N_AVX36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0 + subq $1, %rbx + jne .L867 + jmp .L972 + .p2align 4,,10 + .p2align 3 +.L857: + vpcmpq $6, -64(%r13,%rsi,8), %zmm0, %k0 + kortestb %k0, %k0 + jne .L976 +.L856: + movq %rsi, %rax + addq $8, %rsi + cmpq %rsi, %r15 + jnb .L857 + cmpq %rax, %r15 + je .L916 + vpcmpq $6, -64(%r13,%r15,8), %zmm0, %k0 + xorl %eax, %eax + kortestb %k0, %k0 + sete %al + addl $1, %eax + movl %eax, -136(%rbp) + jmp .L858 +.L918: + movl $12, %eax + movl $11, %esi + jmp .L864 + .p2align 4,,10 + .p2align 3 +.L865: + cmpq $23, %rax + je .L975 +.L864: + movq %rax, %rcx + addq $1, %rax + cmpq (%r12,%rax,8), %rdx + je .L865 + movl $12, %edi + subq $11, %rcx + movq %rdx, %rax + subq %rsi, %rdi + cmpq %rdi, %rcx + jb .L863 +.L975: + movq (%r12,%rsi,8), %rax + jmp .L863 +.L919: + movl $9, %esi + movl $10, %eax + jmp .L864 +.L920: + movl $8, %esi + movl $9, %eax + jmp .L864 +.L921: + movl $7, %esi + movl $8, %eax + jmp .L864 +.L922: + movl $6, %esi + movl $7, %eax + jmp .L864 +.L923: + movl $5, %esi + movl $6, %eax + jmp .L864 +.L831: + movq %r15, %rsi + leaq -112(%rbp), %rdx + vmovdqa64 %zmm5, %zmm0 + movq %r12, %rcx + subq %rax, %rsi + call 
_ZN3hwy6N_AVX36detail22MaybePartitionTwoValueINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, %al + jne .L972 + vmovdqa64 -112(%rbp), %zmm2 + jmp .L836 +.L924: + movl $4, %esi + movl $5, %eax + jmp .L864 +.L925: + movl $3, %esi + movl $4, %eax + jmp .L864 +.L926: + movl $2, %esi + movl $3, %eax + jmp .L864 +.L927: + movl $1, %esi + movl $2, %eax + jmp .L864 +.L979: + xorl %esi, %esi + movl $1, %eax + jmp .L864 +.L989: + movl $255, %edi + vmovdqu64 0(%r13), %zmm0 + kmovd %edi, %k2 + cmpq $255, %rax + ja .L838 + movq $-1, %rdx + bzhi %rax, %rdx, %rdx + movzbl %dl, %edi + kmovd %edi, %k2 +.L838: + vpcmpq $0, %zmm1, %zmm0, %k0 + vpcmpq $0, %zmm5, %zmm0, %k1 + movq %r15, %rsi + kandb %k2, %k1, %k1 + knotb %k2, %k2 + korb %k1, %k0, %k0 + korb %k2, %k0, %k0 + kortestb %k0, %k0 + setc %dl + subq %rcx, %rsi + testb %dl, %dl + je .L992 + kmovb %k1, %eax + popcntq %rax, %rax + subq %rax, %rsi + vmovdqu64 %zmm5, 0(%r13) + movq %rsi, %rdx + cmpq $7, %rsi + jbe .L843 + leaq -8(%rsi), %rcx + movq -152(%rbp), %rsi + movq %rcx, %rax + shrq $3, %rax + salq $6, %rax + leaq 64(%r13,%rax), %rax + .p2align 4,,10 + .p2align 3 +.L844: + vmovdqu64 %zmm1, (%rsi) + addq $64, %rsi + cmpq %rax, %rsi + jne .L844 + andq $-8, %rcx + vmovdqa64 %zmm1, (%r12) + leaq 8(%rcx), %rax + leaq 0(%r13,%rax,8), %r13 + subq %rax, %rdx + movl $255, %eax + kmovd %eax, %k1 + cmpq $255, %rdx + jbe .L895 +.L845: + vmovdqu64 (%r12), %zmm0{%k1}{z} + vmovdqu64 %zmm0, 0(%r13){%k1} + vzeroupper + jmp .L972 +.L861: + movl $11, %eax + movl $10, %esi + jmp .L864 +.L991: + leaq -8(%r15), %rdx + vmovdqu64 0(%r13,%rdx,8), %zmm6 + vpcmpq $4, %zmm5, %zmm6, %k0 + kmovb %k0, %eax + kortestb %k0, %k0 + jne .L974 + vzeroupper + jmp .L972 +.L853: + vmovdqu64 -64(%r13,%rsi,8), %zmm6 + vpcmpq $6, %zmm0, %zmm6, %k0 + kortestb %k0, %k0 + jne .L976 +.L852: + movq %rsi, %rax + addq $8, %rsi + cmpq %rsi, %r15 + jnb .L853 + cmpq %rax, %r15 + je .L854 + vmovdqu64 -64(%r13,%r15,8), %zmm6 + vpcmpq $6, %zmm0, %zmm6, %k0 + kortestb %k0, %k0 + jne .L976 +.L854: + movl $3, -136(%rbp) + vpternlogd $0xFF, %zmm1, %zmm1, %zmm1 + vpaddq %zmm1, %zmm0, %zmm0 + jmp .L858 +.L988: + tzcntl %eax, %eax + jmp .L822 +.L987: + movq %rdi, %rcx + movq %r12, %rdi + cmpq $7, %rdx + jbe .L809 + leaq -8(%rdx), %rdx + movq 0(%r13), %rcx + leaq 8(%r12), %rdi + movq %rdx, %rax + andq $-8, %rdi + shrq $3, %rax + movq %rcx, (%r12) + addq $1, %rax + salq $6, %rax + movl %eax, %ecx + movq -8(%r13,%rcx), %rsi + movq %rsi, -8(%r12,%rcx) + movq %r12, %rcx + movq %r13, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %eax, %ecx + movq %rdx, %rax + movq %r15, %rdx + andq $-8, %rax + shrl $3, %ecx + rep movsq + addq $8, %rax + leaq 0(,%rax,8), %rcx + subq %rax, %rdx + movl $255, %eax + leaq (%r12,%rcx), %rdi + addq %r13, %rcx + cmpq $255, %rdx + jbe .L809 +.L810: + leal -1(%r15), %edx + movl $32, %r8d + movl $1, %esi + kmovb %eax, %k4 + bsrl %edx, %edx + vmovdqu64 (%rcx), %zmm0{%k4}{z} + movq %r15, %rax + xorl $31, %edx + subl %edx, %r8d + movl $1, %edx + vmovdqu64 %zmm0, (%rdi){%k4} + vpbroadcastq .LC10(%rip), %zmm0 + shlx %r8, %rsi, %rsi + shrq $4, %rsi + cmove %rdx, %rsi + movq %rsi, %rdx + salq $4, %rdx + addq $8, %rdx + cmpq %rdx, %r15 + jnb .L814 + .p2align 4,,10 + .p2align 3 +.L811: + vmovdqu64 %zmm0, (%r12,%rax,8) + addq $8, %rax + cmpq %rdx, %rax + jb .L811 +.L814: + movq %r12, %rdi + vzeroupper + call 
_ZN3hwy6N_AVX36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + cmpq $7, %r15 + jbe .L813 + leaq -8(%r15), %rdx + movq (%r12), %rcx + leaq 8(%r13), %rdi + movq %rdx, %rax + andq $-8, %rdi + shrq $3, %rax + movq %rcx, 0(%r13) + addq $1, %rax + salq $6, %rax + movl %eax, %ecx + movq -8(%r12,%rcx), %rsi + movq %rsi, -8(%r13,%rcx) + movq %r13, %rcx + movq %r12, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %eax, %ecx + movq %rdx, %rax + andq $-8, %rax + shrl $3, %ecx + rep movsq + addq $8, %rax + leaq 0(,%rax,8), %rdx + subq %rax, %r15 + movl $255, %eax + addq %rdx, %r13 + addq %rdx, %r12 + cmpq $255, %r15 + jbe .L813 +.L815: + kmovb %eax, %k6 + vmovdqu64 (%r12), %zmm0{%k6}{z} + vmovdqu64 %zmm0, 0(%r13){%k6} + vzeroupper + jmp .L972 +.L843: + vmovdqa64 %zmm1, (%r12) +.L895: + movq $-1, %rax + bzhi %rdx, %rax, %rax + movzbl %al, %eax + kmovd %eax, %k1 + jmp .L845 +.L916: + movl $2, -136(%rbp) + jmp .L858 +.L990: + vpmaxsq %zmm2, %zmm1, %zmm1 + vpcmpq $6, %zmm0, %zmm1, %k0 + kortestb %k0, %k0 + jne .L976 + vmovdqa64 %zmm0, %zmm1 + movl $128, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L850: + leaq (%rcx,%rax,8), %rdx + addq $1, %rax + vpmaxsq 0(%r13,%rdx,8), %zmm1, %zmm1 + cmpq $16, %rax + jne .L850 + vpcmpq $6, %zmm0, %zmm1, %k0 + kortestb %k0, %k0 + jne .L976 + leaq 128(%rsi), %rax + cmpq %rax, %r15 + jb .L852 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L850 +.L813: + movq $-1, %rax + bzhi %r15, %rax, %rax + movzbl %al, %eax + jmp .L815 +.L809: + movq $-1, %rax + bzhi %rdx, %rax, %rax + movzbl %al, %eax + jmp .L810 +.L992: + knotb %k0, %k2 + kmovb %k2, %edx + tzcntl %edx, %edx + vpbroadcastq 0(%r13,%rdx,8), %zmm2 + leaq 8(%rax), %rdx + vmovdqa64 %zmm2, -112(%rbp) + cmpq %rsi, %rdx + ja .L840 +.L841: + vmovdqu64 %zmm1, -64(%r13,%rdx,8) + movq %rdx, %rax + addq $8, %rdx + cmpq %rsi, %rdx + jbe .L841 +.L840: + subq %rax, %rsi + leaq 0(%r13,%rax,8), %rcx + movl $-1, %eax + cmpq $255, %rsi + ja .L842 + orq $-1, %rax + bzhi %rsi, %rax, %rax +.L842: + kmovb %eax, %k4 + vmovdqu64 %zmm1, (%rcx){%k4} + jmp .L836 + .cfi_endproc +.LFE18801: + .size _ZN3hwy6N_AVX36detail7RecurseINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0, .-_ZN3hwy6N_AVX36detail7RecurseINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + .section .text._ZN3hwy6N_SSE46detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_SSE46detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0, @function +_ZN3hwy6N_SSE46detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0: +.LFB18803: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rdi, %r10 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + .cfi_offset 15, -24 + .cfi_offset 14, -32 + .cfi_offset 13, -40 + .cfi_offset 12, -48 + movq %rcx, %r12 + pushq %rbx + subq $104, %rsp + .cfi_offset 3, -56 + movq %rsi, -88(%rbp) + movq %rdx, -72(%rbp) + movq %r9, -80(%rbp) + cmpq $32, %rdx + jbe .L1219 + movq %rdi, %rax + movq %rdi, -128(%rbp) + movq %r8, %rbx + shrq $3, %rax + movq %rax, %rdx + movq %rax, -112(%rbp) + 
andl $7, %edx + jne .L1220 + movq -72(%rbp), %r14 + movq %rdi, %rax +.L1005: + movq 8(%rbx), %rdx + movq 16(%rbx), %r11 + movq %rdx, %rcx + leaq (%rdx,%rdx,8), %rsi + leaq 1(%r11), %r8 + movq %rdx, %rdi + rolq $24, %rcx + xorq (%rbx), %r8 + shrq $11, %rdi + leaq 2(%r11), %rdx + addq %r8, %rcx + xorq %rsi, %rdi + xorq %rdx, %rdi + movq %rcx, %rdx + leaq (%rcx,%rcx,8), %rsi + shrq $11, %rcx + rolq $24, %rdx + xorq %rsi, %rcx + leaq 3(%r11), %rsi + addq %rdi, %rdx + xorq %rsi, %rcx + movq %rdx, %rsi + leaq (%rdx,%rdx,8), %r9 + shrq $11, %rdx + rolq $24, %rsi + xorq %r9, %rdx + leaq 4(%r11), %r9 + addq $5, %r11 + addq %rcx, %rsi + xorq %r9, %rdx + movq %r11, 16(%rbx) + movq %rsi, %r9 + leaq (%rsi,%rsi,8), %r13 + shrq $11, %rsi + rolq $24, %r9 + xorq %r13, %rsi + addq %rdx, %r9 + xorq %r11, %rsi + movabsq $34359738359, %r11 + movq %r9, %r15 + leaq (%r9,%r9,8), %r13 + rolq $24, %r9 + shrq $11, %r15 + addq %rsi, %r9 + movl %esi, %esi + xorq %r15, %r13 + movl %edx, %r15d + movq %r13, %xmm0 + movl %r8d, %r13d + pinsrq $1, %r9, %xmm0 + movq %r14, %r9 + shrq $3, %r9 + cmpq %r11, %r14 + movl $4294967295, %r11d + movl %ecx, %r14d + cmova %r11, %r9 + shrq $32, %r8 + movl %edi, %r11d + movups %xmm0, (%rbx) + shrq $32, %rdi + imulq %r9, %r13 + shrq $32, %rcx + shrq $32, %rdx + imulq %r9, %r8 + imulq %r9, %r11 + shrq $32, %r13 + imulq %r9, %rdi + imulq %r9, %r14 + shrq $32, %r8 + imulq %r9, %rcx + shrq $32, %r11 + imulq %r9, %r15 + shrq $32, %rdi + imulq %r9, %rdx + shrq $32, %r14 + imulq %r9, %rsi + movq %r13, %r9 + shrq $32, %rcx + leaq 2(,%r13,8), %r13 + salq $6, %r9 + shrq $32, %r15 + movdqa (%rax,%r9), %xmm4 + movq %r8, %r9 + shrq $32, %rdx + salq $6, %r9 + shrq $32, %rsi + movdqa (%rax,%r9), %xmm3 + movq %r11, %r9 + salq $6, %r9 + movdqa (%rax,%r9), %xmm2 + movq %rdi, %r9 + salq $6, %r9 + movdqa %xmm2, %xmm0 + movdqa %xmm2, %xmm1 + movdqa (%rax,%r9), %xmm11 + movq %r14, %r9 + pcmpgtq %xmm4, %xmm0 + salq $6, %r9 + leaq 2(,%r14,8), %r14 + pblendvb %xmm0, %xmm4, %xmm1 + pblendvb %xmm0, %xmm2, %xmm4 + movdqa %xmm3, %xmm0 + pcmpgtq %xmm1, %xmm0 + pblendvb %xmm0, %xmm3, %xmm1 + movdqa (%rax,%r9), %xmm3 + movdqa %xmm4, %xmm0 + movq %rcx, %r9 + pcmpgtq %xmm1, %xmm0 + salq $6, %r9 + movdqa (%rax,%r9), %xmm2 + movq %r15, %r9 + leaq 2(,%r15,8), %r15 + salq $6, %r9 + movdqa (%rax,%r15,8), %xmm6 + pblendvb %xmm0, %xmm1, %xmm4 + movdqa %xmm2, %xmm0 + movdqa %xmm2, %xmm1 + movdqa (%rax,%r9), %xmm7 + pcmpgtq %xmm11, %xmm0 + movq %rdx, %r9 + movaps %xmm4, (%r12) + leaq 2(,%rdx,8), %rdx + salq $6, %r9 + pblendvb %xmm0, %xmm11, %xmm1 + pblendvb %xmm0, %xmm2, %xmm11 + movdqa %xmm3, %xmm0 + pcmpgtq %xmm1, %xmm0 + pblendvb %xmm0, %xmm3, %xmm1 + movdqa (%rax,%r9), %xmm3 + movdqa %xmm11, %xmm0 + movq %rsi, %r9 + pcmpgtq %xmm1, %xmm0 + salq $6, %r9 + leaq 2(,%rsi,8), %rsi + movdqa (%rax,%r9), %xmm2 + leaq 0(,%r13,8), %r9 + movdqa 16(%rax,%r9), %xmm13 + pblendvb %xmm0, %xmm1, %xmm11 + movdqa %xmm2, %xmm0 + movdqa %xmm2, %xmm1 + pcmpgtq %xmm7, %xmm0 + movaps %xmm11, 64(%r12) + pblendvb %xmm0, %xmm7, %xmm1 + pblendvb %xmm0, %xmm2, %xmm7 + movdqa %xmm3, %xmm0 + pcmpgtq %xmm1, %xmm0 + pblendvb %xmm0, %xmm3, %xmm1 + movdqa (%rax,%r13,8), %xmm3 + movdqa %xmm7, %xmm0 + leaq 2(,%r8,8), %r13 + movdqa (%rax,%r13,8), %xmm5 + leaq 0(,%r13,8), %r8 + pcmpgtq %xmm1, %xmm0 + leaq 2(,%r11,8), %r13 + movdqa (%rax,%r13,8), %xmm2 + leaq 0(,%r13,8), %r11 + leaq 2(,%rdi,8), %r13 + movdqa (%rax,%r13,8), %xmm10 + leaq 0(,%r13,8), %rdi + leaq 0(,%r14,8), %r13 + pblendvb %xmm0, %xmm1, %xmm7 + movdqa %xmm2, %xmm0 + movdqa %xmm2, %xmm1 + movaps %xmm7, 
128(%r12) + pcmpgtq %xmm3, %xmm0 + pblendvb %xmm0, %xmm3, %xmm1 + pblendvb %xmm0, %xmm2, %xmm3 + movdqa %xmm5, %xmm0 + pcmpgtq %xmm1, %xmm0 + pblendvb %xmm0, %xmm5, %xmm1 + movdqa %xmm3, %xmm0 + movdqa (%rax,%r14,8), %xmm5 + leaq 2(,%rcx,8), %r14 + pcmpgtq %xmm1, %xmm0 + movdqa (%rax,%r14,8), %xmm2 + leaq 0(,%r14,8), %rcx + leaq 0(,%r15,8), %r14 + leaq 0(,%rdx,8), %r15 + pblendvb %xmm0, %xmm1, %xmm3 + movdqa %xmm2, %xmm0 + movdqa %xmm2, %xmm1 + pcmpgtq %xmm10, %xmm0 + movaps %xmm3, 16(%r12) + pblendvb %xmm0, %xmm10, %xmm1 + pblendvb %xmm0, %xmm2, %xmm10 + movdqa %xmm5, %xmm0 + movdqa (%rax,%rsi,8), %xmm2 + pcmpgtq %xmm1, %xmm0 + pblendvb %xmm0, %xmm5, %xmm1 + movdqa %xmm10, %xmm0 + movdqa (%rax,%rdx,8), %xmm5 + leaq 0(,%rsi,8), %rdx + pcmpgtq %xmm1, %xmm0 + pblendvb %xmm0, %xmm1, %xmm10 + movdqa %xmm2, %xmm0 + movdqa %xmm2, %xmm1 + pcmpgtq %xmm6, %xmm0 + movaps %xmm10, 80(%r12) + pblendvb %xmm0, %xmm6, %xmm1 + pblendvb %xmm0, %xmm2, %xmm6 + movdqa %xmm5, %xmm0 + movdqa 16(%rax,%r11), %xmm2 + pcmpgtq %xmm1, %xmm0 + pblendvb %xmm0, %xmm5, %xmm1 + movdqa %xmm6, %xmm0 + pcmpgtq %xmm1, %xmm0 + pblendvb %xmm0, %xmm1, %xmm6 + movdqa %xmm2, %xmm0 + movdqa %xmm2, %xmm1 + movaps %xmm6, 144(%r12) + pcmpgtq %xmm13, %xmm0 + pblendvb %xmm0, %xmm13, %xmm1 + pblendvb %xmm0, %xmm2, %xmm13 + movdqa 16(%rax,%r8), %xmm0 + pcmpgtq %xmm1, %xmm0 + pblendvb %xmm0, 16(%rax,%r8), %xmm1 + movdqa %xmm13, %xmm0 + movdqa 16(%rax,%rcx), %xmm2 + movdqa 16(%rax,%rdi), %xmm9 + pcmpgtq %xmm1, %xmm0 + movdqa 16(%rax,%r14), %xmm5 + movdqa 32(%rax,%r9), %xmm12 + movdqa 32(%rax,%r8), %xmm8 + movdqa 32(%rax,%rcx), %xmm14 + movdqa 32(%rax,%r15), %xmm15 + pblendvb %xmm0, %xmm1, %xmm13 + movdqa %xmm2, %xmm0 + movdqa %xmm2, %xmm1 + pcmpgtq %xmm9, %xmm0 + movaps %xmm13, 32(%r12) + pblendvb %xmm0, %xmm9, %xmm1 + pblendvb %xmm0, %xmm2, %xmm9 + movdqa 16(%rax,%r13), %xmm0 + movdqa 16(%rax,%rdx), %xmm2 + pcmpgtq %xmm1, %xmm0 + pblendvb %xmm0, 16(%rax,%r13), %xmm1 + movdqa %xmm9, %xmm0 + pcmpgtq %xmm1, %xmm0 + pblendvb %xmm0, %xmm1, %xmm9 + movdqa %xmm2, %xmm0 + movdqa %xmm2, %xmm1 + pcmpgtq %xmm5, %xmm0 + movaps %xmm9, 96(%r12) + pblendvb %xmm0, %xmm5, %xmm1 + pblendvb %xmm0, %xmm2, %xmm5 + movdqa 16(%rax,%r15), %xmm0 + movdqa 32(%rax,%r11), %xmm2 + pcmpgtq %xmm1, %xmm0 + pblendvb %xmm0, 16(%rax,%r15), %xmm1 + movdqa %xmm5, %xmm0 + pcmpgtq %xmm1, %xmm0 + pblendvb %xmm0, %xmm1, %xmm5 + movdqa %xmm2, %xmm0 + movdqa %xmm2, %xmm1 + movaps %xmm5, 160(%r12) + pcmpgtq %xmm12, %xmm0 + pblendvb %xmm0, %xmm12, %xmm1 + pblendvb %xmm0, %xmm2, %xmm12 + movdqa %xmm8, %xmm0 + movdqa 32(%rax,%r13), %xmm2 + pcmpgtq %xmm1, %xmm0 + leaq 192(%r12), %r13 + pblendvb %xmm0, %xmm8, %xmm1 + movdqa %xmm12, %xmm0 + movdqa 32(%rax,%rdi), %xmm8 + pcmpgtq %xmm1, %xmm0 + pblendvb %xmm0, %xmm1, %xmm12 + movdqa %xmm14, %xmm0 + movdqa %xmm14, %xmm1 + pcmpgtq %xmm8, %xmm0 + movaps %xmm12, 48(%r12) + pblendvb %xmm0, %xmm8, %xmm1 + pblendvb %xmm0, %xmm14, %xmm8 + movdqa %xmm2, %xmm0 + movdqa 32(%rax,%rdx), %xmm14 + pcmpgtq %xmm1, %xmm0 + pblendvb %xmm0, %xmm2, %xmm1 + movdqa %xmm8, %xmm0 + movdqa %xmm14, %xmm2 + pcmpgtq %xmm1, %xmm0 + pblendvb %xmm0, %xmm1, %xmm8 + movdqa 32(%rax,%r14), %xmm1 + movdqa %xmm14, %xmm0 + movaps %xmm8, 112(%r12) + pcmpgtq %xmm1, %xmm0 + pblendvb %xmm0, %xmm1, %xmm2 + pblendvb %xmm0, %xmm14, %xmm1 + movdqa %xmm15, %xmm0 + pcmpgtq %xmm2, %xmm0 + pblendvb %xmm0, %xmm15, %xmm2 + movdqa %xmm1, %xmm0 + pcmpgtq %xmm2, %xmm0 + pblendvb %xmm0, %xmm2, %xmm1 + movdqa %xmm4, %xmm2 + punpcklqdq %xmm2, %xmm2 + movdqa %xmm1, %xmm0 + movaps %xmm1, 176(%r12) + 
movdqa %xmm3, %xmm1 + pxor %xmm2, %xmm1 + pxor %xmm2, %xmm4 + pxor %xmm2, %xmm13 + movdqa 192(%r12), %xmm3 + por %xmm4, %xmm1 + pxor %xmm2, %xmm12 + pxor %xmm2, %xmm11 + por %xmm13, %xmm1 + pxor %xmm2, %xmm10 + pxor %xmm2, %xmm9 + por %xmm12, %xmm1 + pxor %xmm2, %xmm8 + pxor %xmm2, %xmm7 + por %xmm11, %xmm1 + pxor %xmm2, %xmm6 + pxor %xmm2, %xmm5 + por %xmm10, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm2, %xmm3 + por %xmm9, %xmm1 + por %xmm8, %xmm1 + por %xmm7, %xmm1 + por %xmm6, %xmm1 + por %xmm5, %xmm1 + por %xmm0, %xmm1 + pxor %xmm0, %xmm0 + por %xmm1, %xmm3 + pblendvb %xmm0, %xmm3, %xmm1 + pxor %xmm0, %xmm0 + pcmpeqq %xmm0, %xmm1 + movmskpd %xmm1, %eax + cmpl $3, %eax + je .L1007 + movdqa .LC4(%rip), %xmm0 + movl $2, %esi + movq %r12, %rdi + movq %r10, -112(%rbp) + movups %xmm0, 192(%r12) + movups %xmm0, 208(%r12) + movups %xmm0, 224(%r12) + movups %xmm0, 240(%r12) + movups %xmm0, 256(%r12) + call _ZN3hwy6N_SSE46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + pcmpeqd %xmm0, %xmm0 + movddup (%r12), %xmm2 + movddup 184(%r12), %xmm1 + paddq %xmm1, %xmm0 + movq -112(%rbp), %r10 + pcmpeqq %xmm2, %xmm0 + movmskpd %xmm0, %eax + cmpl $3, %eax + jne .L1009 + movq -72(%rbp), %rsi + movq %r10, %rdi + leaq -64(%rbp), %rdx + movq %r13, %rcx + movdqa %xmm2, %xmm0 + movq %r10, -112(%rbp) + call _ZN3hwy6N_SSE46detail22MaybePartitionTwoValueINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + movq -112(%rbp), %r10 + testb %al, %al + jne .L993 +.L1009: + movq 96(%r12), %rdx + cmpq %rdx, 88(%r12) + jne .L1105 + cmpq 80(%r12), %rdx + jne .L1048 + cmpq 72(%r12), %rdx + jne .L1106 + cmpq 64(%r12), %rdx + jne .L1107 + cmpq 56(%r12), %rdx + jne .L1108 + cmpq 48(%r12), %rdx + jne .L1109 + cmpq 40(%r12), %rdx + jne .L1110 + cmpq 32(%r12), %rdx + jne .L1111 + cmpq 24(%r12), %rdx + jne .L1112 + cmpq 16(%r12), %rdx + jne .L1113 + cmpq 8(%r12), %rdx + jne .L1114 + movq (%r12), %rax + cmpq %rax, %rdx + jne .L1221 +.L1050: + movq %rax, %xmm2 + punpcklqdq %xmm2, %xmm2 +.L1217: + movl $1, -112(%rbp) +.L1046: + cmpq $0, -80(%rbp) + je .L1222 + movq -72(%rbp), %rax + leaq -2(%rax), %r9 + movq %r9, %rdx + movq %r9, %rsi + movdqu (%r10,%r9,8), %xmm15 + andl $7, %edx + andl $6, %esi + je .L1116 + movdqu (%r10), %xmm1 + pcmpeqd %xmm0, %xmm0 + xorl %ecx, %ecx + movdqa .LC0(%rip), %xmm8 + leaq _ZZN3hwy6N_SSE46detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices(%rip), %r13 + movdqa %xmm1, %xmm3 + movdqa %xmm1, %xmm4 + pcmpgtq %xmm2, %xmm3 + pxor %xmm3, %xmm0 + movmskpd %xmm0, %eax + popcntq %rax, %rcx + movq %rcx, %xmm7 + salq $4, %rax + movddup %xmm7, %xmm0 + pshufb 0(%r13,%rax), %xmm4 + pcmpgtq %xmm8, %xmm0 + movq %xmm0, %rax + testq %rax, %rax + je .L1056 + movq %xmm4, (%r10) +.L1056: + pextrq $1, %xmm0, %rax + testq %rax, %rax + je .L1057 + pextrq $1, %xmm4, 8(%r10) +.L1057: + movmskpd %xmm3, %esi + leaq (%r10,%rcx,8), %rax + xorl %ecx, %ecx + popcntq %rsi, %rcx + salq $4, %rsi + pshufb 0(%r13,%rsi), %xmm1 + movups %xmm1, (%r12) + testb $4, %r9b + je .L1058 + movdqu 16(%r10), %xmm1 + pcmpeqd %xmm0, %xmm0 + xorl %edi, %edi + movdqa %xmm1, %xmm3 + movdqa %xmm1, %xmm4 + pcmpgtq %xmm2, %xmm3 + pxor %xmm3, %xmm0 + movmskpd %xmm0, %esi + popcntq %rsi, %rdi + movq %rdi, %xmm7 + salq $4, %rsi + movddup %xmm7, %xmm0 + pshufb 0(%r13,%rsi), %xmm4 + pcmpgtq %xmm8, %xmm0 + movq %xmm0, %rsi + testq %rsi, %rsi + je .L1059 + movq %xmm4, (%rax) +.L1059: + pextrq $1, 
%xmm0, %rsi + testq %rsi, %rsi + je .L1060 + pextrq $1, %xmm4, 8(%rax) +.L1060: + movmskpd %xmm3, %esi + leaq (%rax,%rdi,8), %rax + movq %rsi, %rdi + popcntq %rsi, %rsi + salq $4, %rdi + pshufb 0(%r13,%rdi), %xmm1 + movups %xmm1, (%r12,%rcx,8) + addq %rsi, %rcx + cmpq $5, %rdx + jbe .L1058 + movdqu 32(%r10), %xmm1 + pcmpeqd %xmm0, %xmm0 + xorl %edi, %edi + movdqa %xmm1, %xmm3 + movdqa %xmm1, %xmm4 + pcmpgtq %xmm2, %xmm3 + pxor %xmm3, %xmm0 + movmskpd %xmm0, %esi + popcntq %rsi, %rdi + movq %rdi, %xmm7 + salq $4, %rsi + movddup %xmm7, %xmm0 + pshufb 0(%r13,%rsi), %xmm4 + pcmpgtq %xmm8, %xmm0 + movq %xmm0, %rsi + testq %rsi, %rsi + je .L1061 + movq %xmm4, (%rax) +.L1061: + pextrq $1, %xmm0, %rsi + testq %rsi, %rsi + je .L1062 + pextrq $1, %xmm4, 8(%rax) +.L1062: + movmskpd %xmm3, %esi + leaq (%rax,%rdi,8), %rax + movq %rsi, %rdi + popcntq %rsi, %rsi + salq $4, %rdi + pshufb 0(%r13,%rdi), %xmm1 + movups %xmm1, (%r12,%rcx,8) + addq %rsi, %rcx +.L1058: + leaq -2(%rdx), %rsi + leaq 1(%rdx), %rdi + andq $-2, %rsi + leaq 0(,%rcx,8), %r8 + addq $2, %rsi + cmpq $2, %rdi + movl $2, %edi + cmovbe %rdi, %rsi +.L1055: + cmpq %rsi, %rdx + je .L1063 + movdqu (%r10,%rsi,8), %xmm3 + subq %rsi, %rdx + xorl %esi, %esi + movq %rdx, %xmm0 + movdqa %xmm3, %xmm4 + punpcklqdq %xmm0, %xmm0 + movdqa %xmm3, %xmm5 + pcmpgtq %xmm2, %xmm4 + pcmpgtq %xmm8, %xmm0 + movdqa %xmm4, %xmm1 + pandn %xmm0, %xmm1 + movmskpd %xmm1, %edx + popcntq %rdx, %rsi + movq %rsi, %xmm7 + salq $4, %rdx + movddup %xmm7, %xmm1 + pshufb 0(%r13,%rdx), %xmm5 + pcmpgtq %xmm8, %xmm1 + movq %xmm1, %rdx + testq %rdx, %rdx + je .L1064 + movq %xmm5, (%rax) +.L1064: + pextrq $1, %xmm1, %rdx + testq %rdx, %rdx + je .L1065 + pextrq $1, %xmm5, 8(%rax) +.L1065: + pand %xmm0, %xmm4 + leaq (%rax,%rsi,8), %rax + movmskpd %xmm4, %edx + movq %rdx, %rsi + popcntq %rdx, %rdx + addq %rdx, %rcx + salq $4, %rsi + pshufb 0(%r13,%rsi), %xmm3 + movups %xmm3, (%r12,%r8) + leaq 0(,%rcx,8), %r8 +.L1063: + movq %r9, %rdx + subq %rcx, %rdx + leaq (%r10,%rdx,8), %r11 + cmpl $8, %r8d + jnb .L1066 + testl %r8d, %r8d + jne .L1223 +.L1067: + cmpl $8, %r8d + jnb .L1070 + testl %r8d, %r8d + jne .L1224 +.L1071: + movq %rax, %rcx + movq %r9, %r14 + subq %r10, %rcx + sarq $3, %rcx + subq %rcx, %r14 + subq %rcx, %rdx + movq %rcx, %r15 + leaq (%rax,%rdx,8), %rcx + je .L1117 + movdqu -32(%rcx), %xmm7 + leaq 64(%rax), %rsi + leaq -64(%rcx), %rdi + movdqu (%rax), %xmm14 + movdqu 16(%rax), %xmm13 + movdqu 32(%rax), %xmm12 + movaps %xmm7, -128(%rbp) + movdqu -16(%rcx), %xmm7 + movdqu 48(%rax), %xmm11 + movdqu -64(%rcx), %xmm10 + movdqu -48(%rcx), %xmm9 + movaps %xmm7, -144(%rbp) + cmpq %rdi, %rsi + je .L1118 + xorl %ecx, %ecx + movl $2, %r8d + jmp .L1078 + .p2align 4,,10 + .p2align 3 +.L1226: + movdqu -64(%rdi), %xmm5 + movdqu -48(%rdi), %xmm4 + prefetcht0 -256(%rdi) + subq $64, %rdi + movdqu 32(%rdi), %xmm3 + movdqu 48(%rdi), %xmm1 +.L1077: + movdqa %xmm5, %xmm6 + leaq -2(%rdx,%rcx), %r11 + pcmpgtq %xmm2, %xmm6 + movdqa %xmm6, %xmm7 + movdqa %xmm6, %xmm0 + movmskpd %xmm6, %r9d + punpcklqdq %xmm6, %xmm7 + punpckhqdq %xmm6, %xmm0 + popcntq %r9, %r9 + pandn %xmm7, %xmm0 + pshufd $78, %xmm5, %xmm7 + pblendvb %xmm0, %xmm7, %xmm5 + movups %xmm5, (%rax,%rcx,8) + addq $2, %rcx + movups %xmm5, (%rax,%r11,8) + movdqa %xmm4, %xmm5 + subq %r9, %rcx + pcmpgtq %xmm2, %xmm5 + leaq -4(%rdx,%rcx), %r11 + movdqa %xmm5, %xmm6 + movdqa %xmm5, %xmm0 + movmskpd %xmm5, %r9d + punpcklqdq %xmm5, %xmm6 + punpckhqdq %xmm5, %xmm0 + popcntq %r9, %r9 + pandn %xmm6, %xmm0 + pshufd $78, %xmm4, %xmm6 + pblendvb %xmm0, 
%xmm6, %xmm4 + movups %xmm4, (%rax,%rcx,8) + movups %xmm4, (%rax,%r11,8) + movdqa %xmm3, %xmm4 + movq %r8, %r11 + pcmpgtq %xmm2, %xmm4 + subq %r9, %r11 + addq %r11, %rcx + leaq -6(%rdx,%rcx), %r9 + subq $8, %rdx + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm0 + movmskpd %xmm4, %r11d + punpcklqdq %xmm4, %xmm5 + punpckhqdq %xmm4, %xmm0 + popcntq %r11, %r11 + pandn %xmm5, %xmm0 + pshufd $78, %xmm3, %xmm5 + pblendvb %xmm0, %xmm5, %xmm3 + movups %xmm3, (%rax,%rcx,8) + movups %xmm3, (%rax,%r9,8) + movdqa %xmm1, %xmm3 + movq %r8, %r9 + pcmpgtq %xmm2, %xmm3 + subq %r11, %r9 + addq %rcx, %r9 + leaq (%r9,%rdx), %rcx + movdqa %xmm3, %xmm4 + movdqa %xmm3, %xmm0 + movmskpd %xmm3, %r11d + punpcklqdq %xmm3, %xmm4 + punpckhqdq %xmm3, %xmm0 + popcntq %r11, %r11 + pandn %xmm4, %xmm0 + pshufd $78, %xmm1, %xmm4 + pblendvb %xmm0, %xmm4, %xmm1 + movups %xmm1, (%rax,%r9,8) + movups %xmm1, (%rax,%rcx,8) + movq %r8, %rcx + subq %r11, %rcx + addq %r9, %rcx + cmpq %rdi, %rsi + je .L1225 +.L1078: + movq %rsi, %r9 + subq %rax, %r9 + sarq $3, %r9 + subq %rcx, %r9 + cmpq $8, %r9 + ja .L1226 + movdqu (%rsi), %xmm5 + movdqu 16(%rsi), %xmm4 + prefetcht0 256(%rsi) + addq $64, %rsi + movdqu -32(%rsi), %xmm3 + movdqu -16(%rsi), %xmm1 + jmp .L1077 + .p2align 4,,10 + .p2align 3 +.L1220: + movl $8, %eax + subq %rdx, %rax + leaq (%rdi,%rax,8), %rax + movq -72(%rbp), %rdi + leaq -8(%rdx,%rdi), %r14 + jmp .L1005 + .p2align 4,,10 + .p2align 3 +.L1225: + leaq 2(%rcx), %rsi + leaq (%rax,%rcx,8), %rdi + addq %rdx, %rcx +.L1075: + movdqa %xmm14, %xmm1 + movdqa -128(%rbp), %xmm7 + pcmpgtq %xmm2, %xmm1 + movdqa %xmm1, %xmm3 + movdqa %xmm1, %xmm0 + movmskpd %xmm1, %r8d + punpcklqdq %xmm1, %xmm3 + punpckhqdq %xmm1, %xmm0 + movdqa %xmm13, %xmm1 + popcntq %r8, %r8 + pcmpgtq %xmm2, %xmm1 + pandn %xmm3, %xmm0 + pshufd $78, %xmm14, %xmm3 + subq %r8, %rsi + pblendvb %xmm0, %xmm3, %xmm14 + movups %xmm14, (%rdi) + leaq -4(%rdx,%rsi), %rdi + movdqa %xmm1, %xmm3 + movdqa %xmm1, %xmm0 + movups %xmm14, -16(%rax,%rcx,8) + movmskpd %xmm1, %ecx + punpcklqdq %xmm1, %xmm3 + punpckhqdq %xmm1, %xmm0 + movdqa %xmm12, %xmm1 + pcmpgtq %xmm2, %xmm1 + pandn %xmm3, %xmm0 + pshufd $78, %xmm13, %xmm3 + pblendvb %xmm0, %xmm3, %xmm13 + movups %xmm13, (%rax,%rsi,8) + movdqa %xmm1, %xmm3 + movups %xmm13, (%rax,%rdi,8) + movdqa %xmm1, %xmm0 + xorl %edi, %edi + punpcklqdq %xmm1, %xmm3 + popcntq %rcx, %rdi + punpckhqdq %xmm1, %xmm0 + subq %rdi, %rsi + leaq 2(%rsi), %rcx + pandn %xmm3, %xmm0 + movmskpd %xmm1, %edi + pshufd $78, %xmm12, %xmm3 + movdqa %xmm11, %xmm1 + leaq -6(%rdx,%rcx), %rsi + popcntq %rdi, %rdi + pcmpgtq %xmm2, %xmm1 + pblendvb %xmm0, %xmm3, %xmm12 + movups %xmm12, (%rax,%rcx,8) + movups %xmm12, (%rax,%rsi,8) + movl $2, %esi + movq %rsi, %r8 + movdqa %xmm1, %xmm3 + movdqa %xmm1, %xmm0 + subq %rdi, %r8 + punpcklqdq %xmm1, %xmm3 + punpckhqdq %xmm1, %xmm0 + addq %r8, %rcx + movmskpd %xmm1, %r8d + movdqa %xmm10, %xmm1 + pcmpgtq %xmm2, %xmm1 + pandn %xmm3, %xmm0 + pshufd $78, %xmm11, %xmm3 + popcntq %r8, %r8 + pblendvb %xmm0, %xmm3, %xmm11 + leaq -8(%rdx,%rcx), %rdi + movups %xmm11, (%rax,%rcx,8) + movdqa %xmm1, %xmm3 + movups %xmm11, (%rax,%rdi,8) + movdqa %xmm1, %xmm0 + movq %rsi, %rdi + punpcklqdq %xmm1, %xmm3 + subq %r8, %rdi + punpckhqdq %xmm1, %xmm0 + movmskpd %xmm1, %r8d + movdqa %xmm9, %xmm1 + pandn %xmm3, %xmm0 + addq %rcx, %rdi + pcmpgtq %xmm2, %xmm1 + pshufd $78, %xmm10, %xmm3 + leaq -10(%rdx,%rdi), %rcx + popcntq %r8, %r8 + pblendvb %xmm0, %xmm3, %xmm10 + movups %xmm10, (%rax,%rdi,8) + movdqa %xmm1, %xmm3 + movups %xmm10, (%rax,%rcx,8) + movdqa %xmm1, 
%xmm0 + movq %rsi, %rcx + punpcklqdq %xmm1, %xmm3 + subq %r8, %rcx + punpckhqdq %xmm1, %xmm0 + movmskpd %xmm1, %r8d + movdqa %xmm7, %xmm1 + pandn %xmm3, %xmm0 + addq %rdi, %rcx + pcmpgtq %xmm2, %xmm1 + pshufd $78, %xmm9, %xmm3 + leaq -12(%rdx,%rcx), %rdi + popcntq %r8, %r8 + pblendvb %xmm0, %xmm3, %xmm9 + movups %xmm9, (%rax,%rcx,8) + movdqa %xmm1, %xmm3 + movups %xmm9, (%rax,%rdi,8) + movdqa %xmm1, %xmm0 + movq %rsi, %rdi + punpcklqdq %xmm1, %xmm3 + subq %r8, %rdi + punpckhqdq %xmm1, %xmm0 + addq %rcx, %rdi + pandn %xmm3, %xmm0 + pshufd $78, %xmm7, %xmm3 + pblendvb %xmm0, %xmm3, %xmm7 + leaq -14(%rdx,%rdi), %rcx + movmskpd %xmm1, %r8d + movups %xmm7, (%rax,%rdi,8) + popcntq %r8, %r8 + movups %xmm7, (%rax,%rcx,8) + movdqa -144(%rbp), %xmm7 + movq %rsi, %rcx + subq %r8, %rcx + movdqa %xmm7, %xmm1 + addq %rdi, %rcx + pcmpgtq %xmm2, %xmm1 + leaq -16(%rdx,%rcx), %rdx + movdqa %xmm1, %xmm3 + movdqa %xmm1, %xmm0 + movmskpd %xmm1, %edi + punpcklqdq %xmm1, %xmm3 + punpckhqdq %xmm1, %xmm0 + popcntq %rdi, %rdi + subq %rdi, %rsi + pandn %xmm3, %xmm0 + pshufd $78, %xmm7, %xmm3 + movq %r14, %rdi + pblendvb %xmm0, %xmm3, %xmm7 + movups %xmm7, (%rax,%rcx,8) + movups %xmm7, (%rax,%rdx,8) + leaq (%rsi,%rcx), %rdx + subq %rdx, %rdi + leaq 0(,%rdx,8), %rsi +.L1074: + movdqa %xmm15, %xmm1 + cmpq $2, %rdi + leaq -16(,%r14,8), %rcx + pcmpgtq %xmm2, %xmm1 + cmovnb %rsi, %rcx + pcmpeqd %xmm0, %xmm0 + movdqa %xmm15, %xmm2 + movdqu (%rax,%rcx), %xmm7 + xorl %ecx, %ecx + pxor %xmm1, %xmm0 + movmskpd %xmm0, %edi + movups %xmm7, (%rax,%r14,8) + popcntq %rdi, %rcx + movq %rcx, %xmm7 + salq $4, %rdi + movddup %xmm7, %xmm0 + pshufb 0(%r13,%rdi), %xmm2 + pcmpgtq %xmm8, %xmm0 + movq %xmm0, %rdi + testq %rdi, %rdi + je .L1080 + movq %xmm2, (%rax,%rsi) +.L1080: + pextrq $1, %xmm0, %rdi + testq %rdi, %rdi + je .L1081 + pextrq $1, %xmm2, 8(%rax,%rsi) +.L1081: + addq %rdx, %rcx + movmskpd %xmm1, %edx + movq %rdx, %rdi + leaq 0(,%rcx,8), %rsi + salq $4, %rdi + pshufb 0(%r13,%rdi), %xmm15 + xorl %edi, %edi + popcntq %rdx, %rdi + movq %rdi, %xmm0 + punpcklqdq %xmm0, %xmm0 + pcmpgtq %xmm8, %xmm0 + movq %xmm0, %rdx + testq %rdx, %rdx + je .L1082 + movq %xmm15, (%rax,%rcx,8) +.L1082: + pextrq $1, %xmm0, %rdx + testq %rdx, %rdx + je .L1083 + pextrq $1, %xmm15, 8(%rax,%rsi) +.L1083: + movq -80(%rbp), %r14 + addq %rcx, %r15 + subq $1, %r14 + cmpl $2, -112(%rbp) + je .L1085 + movq -88(%rbp), %rsi + movq %r10, %rdi + movq %r14, %r9 + movq %rbx, %r8 + movq %r12, %rcx + movq %r15, %rdx + movq %r10, -80(%rbp) + call _ZN3hwy6N_SSE46detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + cmpl $3, -112(%rbp) + movq -80(%rbp), %r10 + je .L993 +.L1085: + movq -72(%rbp), %rdx + movq -88(%rbp), %rsi + leaq (%r10,%r15,8), %rdi + movq %r14, %r9 + movq %rbx, %r8 + movq %r12, %rcx + subq %r15, %rdx + call _ZN3hwy6N_SSE46detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L993: + addq $104, %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L1070: + .cfi_restore_state + movq (%r12), %rcx + leaq 8(%r11), %rdi + andq $-8, %rdi + movq %rcx, (%r11) + movl %r8d, %ecx + movq -8(%r12,%rcx), %rsi + movq %rsi, -8(%r11,%rcx) + subq %rdi, %r11 + movq %r12, %rsi + leal (%r8,%r11), %ecx + subq %r11, %rsi + shrl $3, %ecx + rep movsq + jmp .L1071 + .p2align 4,,10 + .p2align 3 +.L1066: + movq (%r11), %rcx + 
leaq 8(%rax), %rdi + andq $-8, %rdi + movq %rcx, (%rax) + movl %r8d, %ecx + movq -8(%r11,%rcx), %rsi + movq %rsi, -8(%rax,%rcx) + movq %rax, %rcx + movq %r11, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %r8d, %ecx + shrl $3, %ecx + rep movsq + jmp .L1067 +.L1224: + movzbl (%r12), %ecx + movb %cl, (%r11) + jmp .L1071 +.L1223: + movzbl (%r11), %ecx + movb %cl, (%rax) + jmp .L1067 +.L1219: + cmpq $1, %rdx + jbe .L993 + leaq 256(%rdi), %rax + cmpq %rax, %rsi + jb .L997 + movl $2, %esi + call _ZN3hwy6N_SSE46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + jmp .L993 +.L1007: + movq -112(%rbp), %rax + movl $2, %edi + movdqu (%r10), %xmm0 + movdqa .LC0(%rip), %xmm8 + andl $1, %eax + pcmpeqq %xmm2, %xmm0 + subq %rax, %rdi + movq %rdi, %xmm7 + movddup %xmm7, %xmm1 + pcmpgtq %xmm8, %xmm1 + pandn %xmm1, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + jne .L1227 + pxor %xmm1, %xmm1 + movq -72(%rbp), %r8 + leaq 256(%r10,%rdi,8), %rsi + movdqa %xmm1, %xmm0 + movdqa %xmm1, %xmm4 + .p2align 4,,10 + .p2align 3 +.L1013: + movq %rdi, %rcx + leaq 32(%rdi), %rdi + cmpq %rdi, %r8 + jb .L1228 + leaq -256(%rsi), %rax +.L1012: + movdqa (%rax), %xmm3 + leaq 32(%rax), %rdx + pxor %xmm2, %xmm3 + por %xmm3, %xmm0 + movdqa 16(%rax), %xmm3 + pxor %xmm2, %xmm3 + por %xmm3, %xmm1 + movdqa 32(%rax), %xmm3 + pxor %xmm2, %xmm3 + por %xmm3, %xmm0 + movdqa 48(%rax), %xmm3 + pxor %xmm2, %xmm3 + por %xmm3, %xmm1 + movdqa 64(%rax), %xmm3 + pxor %xmm2, %xmm3 + por %xmm3, %xmm0 + movdqa 80(%rax), %xmm3 + leaq 96(%rdx), %rax + pxor %xmm2, %xmm3 + por %xmm3, %xmm1 + movdqa 64(%rdx), %xmm3 + pxor %xmm2, %xmm3 + por %xmm3, %xmm0 + movdqa 80(%rdx), %xmm3 + pxor %xmm2, %xmm3 + por %xmm3, %xmm1 + cmpq %rsi, %rax + jne .L1012 + movdqa %xmm0, %xmm3 + leaq 352(%rdx), %rsi + por %xmm1, %xmm3 + pcmpeqq %xmm4, %xmm3 + movmskpd %xmm3, %eax + cmpl $3, %eax + je .L1013 + movdqa %xmm2, %xmm0 + pcmpeqd %xmm1, %xmm1 + pcmpeqq (%r10,%rcx,8), %xmm0 + pxor %xmm1, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + jne .L1015 + .p2align 4,,10 + .p2align 3 +.L1014: + addq $2, %rcx + movdqa %xmm2, %xmm0 + pcmpeqq (%r10,%rcx,8), %xmm0 + pxor %xmm1, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + je .L1014 +.L1015: + rep bsfl %eax, %eax + cltq + addq %rcx, %rax +.L1011: + leaq (%r10,%rax,8), %r8 + movq (%r8), %rdi + movq %rdi, %xmm7 + movddup %xmm7, %xmm1 + movdqa %xmm1, %xmm0 + pcmpgtq %xmm2, %xmm0 + movmskpd %xmm0, %edx + testl %edx, %edx + jne .L1020 + movq -72(%rbp), %rsi + xorl %ecx, %ecx + leaq -2(%rsi), %rax + jmp .L1027 + .p2align 4,,10 + .p2align 3 +.L1021: + movmskpd %xmm0, %edx + movups %xmm2, (%r10,%rax,8) + popcntq %rdx, %rdx + addq %rdx, %rcx + leaq -2(%rax), %rdx + cmpq %rdx, %rsi + jbe .L1229 + movq %rdx, %rax +.L1027: + movdqu (%r10,%rax,8), %xmm3 + movdqu (%r10,%rax,8), %xmm0 + pcmpeqq %xmm1, %xmm3 + pcmpeqq %xmm2, %xmm0 + movdqa %xmm3, %xmm4 + por %xmm0, %xmm4 + movmskpd %xmm4, %edx + cmpl $3, %edx + je .L1021 + pcmpeqd %xmm4, %xmm4 + leaq 2(%rax), %rsi + pxor %xmm4, %xmm0 + pandn %xmm0, %xmm3 + movmskpd %xmm3, %edx + rep bsfl %edx, %edx + movslq %edx, %rdx + addq %rax, %rdx + addq $4, %rax + movddup (%r10,%rdx,8), %xmm3 + movq -72(%rbp), %rdx + movaps %xmm3, -64(%rbp) + subq %rcx, %rdx + cmpq %rax, %rdx + jb .L1022 + .p2align 4,,10 + .p2align 3 +.L1023: + movups %xmm1, -16(%r10,%rax,8) + movq %rax, %rsi + addq $2, %rax + cmpq %rdx, %rax + jbe .L1023 +.L1022: + subq %rsi, %rdx + leaq 0(,%rsi,8), %rcx + movq %rdx, %xmm0 + punpcklqdq %xmm0, %xmm0 + pcmpgtq %xmm8, %xmm0 + movq 
%xmm0, %rax + testq %rax, %rax + je .L1031 + movq %rdi, (%r10,%rsi,8) +.L1031: + pextrq $1, %xmm0, %rax + testq %rax, %rax + je .L1026 + movq %rdi, 8(%r10,%rcx) +.L1026: + movdqa %xmm2, %xmm0 + pcmpeqq .LC12(%rip), %xmm0 + movmskpd %xmm0, %eax + cmpl $3, %eax + je .L1103 + movdqa %xmm2, %xmm0 + pcmpeqq .LC4(%rip), %xmm0 + movmskpd %xmm0, %eax + movl %eax, -112(%rbp) + cmpl $3, %eax + je .L1230 + movdqa %xmm3, %xmm0 + movdqa %xmm3, %xmm5 + movdqa %xmm2, %xmm4 + pcmpgtq %xmm1, %xmm0 + pblendvb %xmm0, %xmm1, %xmm5 + pcmpgtq %xmm5, %xmm4 + movmskpd %xmm4, %eax + testl %eax, %eax + jne .L1231 + movdqa %xmm2, %xmm3 + movl $32, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L1037: + leaq (%rcx,%rax,2), %rdx + addq $1, %rax + movdqu (%r10,%rdx,8), %xmm1 + movdqa %xmm1, %xmm0 + pcmpgtq %xmm3, %xmm0 + pblendvb %xmm0, %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + cmpq $16, %rax + jne .L1037 + movdqa %xmm2, %xmm0 + pcmpgtq %xmm1, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + jne .L1217 + leaq 32(%rsi), %rax + cmpq %rax, -72(%rbp) + jb .L1232 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L1037 +.L1116: + xorl %r8d, %r8d + xorl %ecx, %ecx + leaq _ZZN3hwy6N_SSE46detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices(%rip), %r13 + movq %r10, %rax + movdqa .LC0(%rip), %xmm8 + jmp .L1055 +.L1118: + movq %rdx, %rcx + movq %rax, %rdi + movl $2, %esi + jmp .L1075 +.L1117: + xorl %esi, %esi + movq %r14, %rdi + jmp .L1074 +.L1222: + movq -72(%rbp), %rsi + movq %r10, %rdi + leaq -1(%rsi), %rbx + movq %rbx, %r12 + shrq %r12 + .p2align 4,,10 + .p2align 3 +.L1053: + movq %r12, %rdx + call _ZN3hwy6N_SSE46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0 + subq $1, %r12 + jnb .L1053 + .p2align 4,,10 + .p2align 3 +.L1054: + movq (%rdi,%rbx,8), %rdx + movq (%rdi), %rax + movq %rbx, %rsi + movq %rdx, (%rdi) + xorl %edx, %edx + movq %rax, (%rdi,%rbx,8) + call _ZN3hwy6N_SSE46detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0 + subq $1, %rbx + jne .L1054 + jmp .L993 +.L1105: + movl $12, %eax + movl $11, %esi + jmp .L1051 + .p2align 4,,10 + .p2align 3 +.L1052: + cmpq $23, %rax + je .L1216 +.L1051: + movq %rax, %rcx + addq $1, %rax + cmpq (%r12,%rax,8), %rdx + je .L1052 + movl $12, %edi + subq $11, %rcx + movq %rdx, %rax + subq %rsi, %rdi + cmpq %rdi, %rcx + jb .L1050 +.L1216: + movq (%r12,%rsi,8), %rax + jmp .L1050 +.L1106: + movl $9, %esi + movl $10, %eax + jmp .L1051 +.L1107: + movl $8, %esi + movl $9, %eax + jmp .L1051 +.L1108: + movl $7, %esi + movl $8, %eax + jmp .L1051 +.L1109: + movl $6, %esi + movl $7, %eax + jmp .L1051 +.L1228: + movq -72(%rbp), %rsi + pcmpeqd %xmm1, %xmm1 + .p2align 4,,10 + .p2align 3 +.L1017: + movq %rcx, %rdx + addq $2, %rcx + cmpq %rcx, %rsi + jb .L1233 + movdqa %xmm2, %xmm0 + pcmpeqq -16(%r10,%rcx,8), %xmm0 + pxor %xmm1, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + je .L1017 +.L1214: + rep bsfl %eax, %eax + cltq + addq %rdx, %rax + jmp .L1011 +.L1110: + movl $5, %esi + movl $6, %eax + jmp .L1051 +.L1111: + movl $4, %esi + movl $5, %eax + jmp .L1051 +.L1020: + movq -72(%rbp), %rsi + leaq -64(%rbp), %rdx + movq %r12, %rcx + movdqa %xmm2, %xmm0 + movq %r8, %rdi + movq %r10, -128(%rbp) + subq %rax, %rsi + movaps %xmm1, -112(%rbp) + call _ZN3hwy6N_SSE46detail22MaybePartitionTwoValueINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, 
%al + jne .L993 + movdqa -64(%rbp), %xmm3 + movq -128(%rbp), %r10 + movddup (%r12), %xmm2 + movdqa -112(%rbp), %xmm1 + jmp .L1026 +.L1112: + movl $3, %esi + movl $4, %eax + jmp .L1051 +.L1113: + movl $2, %esi + movl $3, %eax + jmp .L1051 +.L1114: + movl $1, %esi + movl $2, %eax + jmp .L1051 +.L1221: + xorl %esi, %esi + movl $1, %eax + jmp .L1051 +.L1229: + movdqu (%r10), %xmm4 + movdqu (%r10), %xmm0 + movq %rax, %xmm7 + movddup %xmm7, %xmm3 + movq -72(%rbp), %rdx + pcmpeqq %xmm2, %xmm4 + pcmpeqq %xmm1, %xmm0 + pcmpgtq %xmm8, %xmm3 + subq %rcx, %rdx + movdqa %xmm4, %xmm5 + por %xmm4, %xmm0 + pcmpeqd %xmm4, %xmm4 + pand %xmm3, %xmm5 + pxor %xmm4, %xmm3 + por %xmm3, %xmm0 + movmskpd %xmm0, %esi + cmpl $3, %esi + jne .L1234 + movmskpd %xmm5, %ecx + movq %rdx, %rax + movups %xmm2, (%r10) + popcntq %rcx, %rcx + subq %rcx, %rax + cmpq $1, %rax + jbe .L1092 + leaq -2(%rax), %rcx + movq -128(%rbp), %rsi + movq %rcx, %rdx + shrq %rdx + salq $4, %rdx + leaq 16(%r10,%rdx), %rdx +.L1034: + movups %xmm1, (%rsi) + addq $16, %rsi + cmpq %rsi, %rdx + jne .L1034 + movq %rcx, %rdx + andq $-2, %rdx + addq $2, %rdx + leaq 0(,%rdx,8), %rcx + subq %rdx, %rax +.L1033: + movaps %xmm1, (%r12) + testq %rax, %rax + je .L993 + leaq (%r10,%rcx), %rdi + leaq 0(,%rax,8), %rdx + movq %r12, %rsi + call memcpy@PLT + jmp .L993 +.L1232: + movq -72(%rbp), %rdx + jmp .L1044 +.L1045: + movdqu -16(%r10,%rsi,8), %xmm7 + movdqa %xmm2, %xmm0 + pcmpgtq %xmm7, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + jne .L1217 +.L1044: + movq %rsi, %rax + addq $2, %rsi + cmpq %rsi, %rdx + jnb .L1045 + movq -72(%rbp), %rdi + cmpq %rax, %rdi + je .L1103 + movdqu -16(%r10,%rdi,8), %xmm7 + movdqa %xmm2, %xmm0 + pcmpgtq %xmm7, %xmm0 + movaps %xmm7, -112(%rbp) + movmskpd %xmm0, %eax + cmpl $1, %eax + movl $1, %eax + adcl $0, %eax + movl %eax, -112(%rbp) + jmp .L1046 +.L1048: + movl $11, %eax + movl $10, %esi + jmp .L1051 +.L1233: + movq -72(%rbp), %rax + pcmpeqd %xmm1, %xmm1 + leaq -2(%rax), %rdx + movdqu (%r10,%rdx,8), %xmm0 + pcmpeqq %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + je .L993 + jmp .L1214 +.L1227: + rep bsfl %eax, %eax + cltq + jmp .L1011 +.L1103: + movl $2, -112(%rbp) + jmp .L1046 +.L997: + movq %rdx, %r11 + leaq -2(%rdx), %rdx + movq (%rdi), %rax + movq %r10, %rsi + movq %rdx, %rbx + andq $-2, %rdx + shrq %rbx + movq %rax, (%rcx) + leaq 2(%rdx), %r13 + addq $1, %rbx + salq $4, %rbx + movl %ebx, %r8d + leaq 8(%rdi,%r8), %r15 + leaq 8(%rcx,%r8), %r14 + movq -16(%r15), %rax + leaq 8(%rcx), %rdi + andq $-8, %rdi + movq %rax, -16(%r14) + movq %rcx, %rax + subq %rdi, %rax + subq %rax, %rsi + leal (%rbx,%rax), %ecx + movq %r11, %rax + shrl $3, %ecx + subq %r13, %rax + rep movsq + movq %r11, -72(%rbp) + movq %rax, %rsi + movq %rax, -80(%rbp) + je .L998 + leaq 0(,%r13,8), %rax + leaq 0(,%rsi,8), %rdx + movq %r8, -112(%rbp) + leaq (%r10,%rax), %rsi + leaq (%r12,%rax), %rdi + movq %r10, -88(%rbp) + call memcpy@PLT + movq -72(%rbp), %r11 + movl $32, %ecx + movq -88(%rbp), %r10 + movq -112(%rbp), %r8 + movl %r11d, %eax + subl $1, %eax + bsrl %eax, %eax + xorl $31, %eax + subl %eax, %ecx + movl $1, %eax + salq %cl, %rax + shrq $4, %rax + movq %rax, %rsi + movl $1, %eax + cmove %rax, %rsi + movq %rsi, %rax + salq $4, %rax + addq $2, %rax + cmpq %rax, %r11 + jnb .L1235 +.L999: + movdqa .LC4(%rip), %xmm0 + movq -72(%rbp), %rdx +.L1003: + movups %xmm0, (%r12,%rdx,8) + addq $2, %rdx + cmpq %rax, %rdx + jb .L1003 + movq %r12, %rdi + movq %r8, -88(%rbp) + movq %r10, -72(%rbp) + call 
_ZN3hwy6N_SSE46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + movq -72(%rbp), %r10 + movq (%r12), %rax + movq %r12, %rsi + movq %rax, (%r10) + movq -88(%rbp), %r8 + leaq 8(%r10), %rdi + andq $-8, %rdi + movq -8(%r12,%r8), %rax + movq %rax, -8(%r10,%r8) + movq %r10, %rax + subq %rdi, %rax + leal (%rbx,%rax), %ecx + subq %rax, %rsi + shrl $3, %ecx + rep movsq + cmpq $0, -80(%rbp) + je .L993 +.L1001: + movq -80(%rbp), %rdx + salq $3, %r13 + leaq (%r10,%r13), %rdi + leaq (%r12,%r13), %rsi + salq $3, %rdx + call memcpy@PLT + jmp .L993 +.L1230: + pcmpeqd %xmm0, %xmm0 + paddq %xmm0, %xmm2 + jmp .L1046 +.L1231: + movdqa %xmm1, %xmm7 + pblendvb %xmm0, %xmm3, %xmm7 + movdqa %xmm7, %xmm0 + pcmpgtq %xmm2, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + jne .L1217 + movdqa %xmm2, %xmm1 + movl $32, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L1038: + leaq (%rcx,%rax,2), %rdx + movdqa %xmm1, %xmm7 + addq $1, %rax + movdqu (%r10,%rdx,8), %xmm3 + movdqa %xmm3, %xmm0 + pcmpgtq %xmm1, %xmm0 + pblendvb %xmm0, %xmm3, %xmm7 + movdqa %xmm7, %xmm0 + movdqa %xmm7, %xmm1 + cmpq $16, %rax + jne .L1038 + pcmpgtq %xmm2, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + jne .L1217 + leaq 32(%rsi), %rax + cmpq %rax, -72(%rbp) + jb .L1040 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L1038 +.L1041: + movdqu -16(%r10,%rsi,8), %xmm0 + pcmpgtq %xmm2, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + jne .L1217 +.L1040: + movq %rsi, %rax + addq $2, %rsi + cmpq %rsi, -72(%rbp) + jnb .L1041 + movq -72(%rbp), %rdi + cmpq %rax, %rdi + je .L1042 + movdqu -16(%r10,%rdi,8), %xmm7 + movdqa %xmm7, %xmm0 + movaps %xmm7, -112(%rbp) + pcmpgtq %xmm2, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + jne .L1217 +.L1042: + pcmpeqd %xmm0, %xmm0 + movl $3, -112(%rbp) + paddq %xmm0, %xmm2 + jmp .L1046 +.L998: + movq -72(%rbp), %rdi + movl $32, %ecx + movl %edi, %eax + subl $1, %eax + bsrl %eax, %eax + xorl $31, %eax + subl %eax, %ecx + movl $1, %eax + salq %cl, %rax + shrq $4, %rax + movq %rax, %rsi + movl $1, %eax + cmove %rax, %rsi + movq %rsi, %rax + salq $4, %rax + addq $2, %rax + cmpq %rax, %rdi + jb .L999 + movq %r12, %rdi + movq %r10, -72(%rbp) + call _ZN3hwy6N_SSE46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + movq -72(%rbp), %r10 + movq (%r12), %rax + movq %r12, %rsi + leaq 8(%r10), %rdi + movq %rax, (%r10) + movq -16(%r14), %rax + andq $-8, %rdi + subq %rdi, %r10 + movq %rax, -16(%r15) + leal (%rbx,%r10), %ecx + subq %r10, %rsi + shrl $3, %ecx + rep movsq + jmp .L993 + .p2align 4,,10 + .p2align 3 +.L1092: + xorl %ecx, %ecx + jmp .L1033 +.L1234: + pxor %xmm4, %xmm0 + movmskpd %xmm0, %ecx + rep bsfl %ecx, %ecx + movslq %ecx, %rcx + movddup (%r10,%rcx,8), %xmm3 + leaq 2(%rax), %rcx + movaps %xmm3, -64(%rbp) + cmpq %rdx, %rcx + ja .L1029 +.L1030: + movups %xmm1, -16(%r10,%rcx,8) + movq %rcx, %rax + addq $2, %rcx + cmpq %rdx, %rcx + jbe .L1030 +.L1029: + subq %rax, %rdx + leaq 0(,%rax,8), %rcx + movq %rdx, %xmm0 + punpcklqdq %xmm0, %xmm0 + pcmpgtq %xmm8, %xmm0 + movq %xmm0, %rdx + testq %rdx, %rdx + je .L1031 + movq %rdi, (%r10,%rax,8) + jmp .L1031 +.L1235: + movq %r12, %rdi + movq %r10, -72(%rbp) + call _ZN3hwy6N_SSE46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + movq -72(%rbp), %r10 + movq (%r12), %rax + movq %r12, %rsi + movq %rax, (%r10) + movq -16(%r14), %rax + leaq 8(%r10), %rdi + andq $-8, %rdi + movq 
%rax, -16(%r15) + movq %r10, %rax + subq %rdi, %rax + leal (%rbx,%rax), %ecx + subq %rax, %rsi + shrl $3, %ecx + rep movsq + jmp .L1001 + .cfi_endproc +.LFE18803: + .size _ZN3hwy6N_SSE46detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0, .-_ZN3hwy6N_SSE46detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + .section .text._ZN3hwy7N_SSSE36detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy7N_SSSE36detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0, @function +_ZN3hwy7N_SSSE36detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0: +.LFB18805: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + pushq %r14 + pushq %r13 + .cfi_offset 15, -24 + .cfi_offset 14, -32 + .cfi_offset 13, -40 + movq %rcx, %r13 + pushq %r12 + pushq %rbx + subq $392, %rsp + .cfi_offset 12, -48 + .cfi_offset 3, -56 + movq %rdi, -104(%rbp) + movq %rsi, -264(%rbp) + movq %rdx, -216(%rbp) + movq %r8, -192(%rbp) + movq %r9, -224(%rbp) + cmpq $32, %rdx + jbe .L1463 + movq %rdi, %r10 + movq %rdi, %r12 + shrq $3, %r10 + movq %r10, %rax + andl $7, %eax + jne .L1464 + movq %rdx, %r14 + movq %rdi, %r15 + movq %r8, %rax +.L1248: + movq 8(%rax), %rdx + movq 16(%rax), %r9 + movq %rdx, %rsi + leaq 1(%r9), %rdi + leaq (%rdx,%rdx,8), %rcx + xorq (%rax), %rdi + rolq $24, %rsi + movq %rsi, %rax + movq %rdx, %rsi + leaq 2(%r9), %rdx + addq %rdi, %rax + shrq $11, %rsi + xorq %rsi, %rcx + movq %rax, %rsi + movq %rax, %r8 + rolq $24, %rsi + xorq %rdx, %rcx + shrq $11, %r8 + leaq (%rax,%rax,8), %rdx + addq %rcx, %rsi + xorq %r8, %rdx + leaq 3(%r9), %rax + movq %rsi, %r11 + movq %rsi, %r8 + xorq %rax, %rdx + shrq $11, %r8 + rolq $24, %r11 + leaq (%rsi,%rsi,8), %rax + leaq 4(%r9), %rsi + addq %rdx, %r11 + xorq %r8, %rax + addq $5, %r9 + xorq %rsi, %rax + movq %r11, %r8 + movq %r11, %rsi + shrq $11, %rsi + rolq $24, %r8 + addq %rax, %r8 + movq %rsi, %rbx + leaq (%r11,%r11,8), %rsi + xorq %rbx, %rsi + movq %r8, %rbx + leaq (%r8,%r8,8), %r11 + rolq $24, %r8 + xorq %r9, %rsi + shrq $11, %rbx + xorq %rbx, %r11 + addq %rsi, %r8 + movq -192(%rbp), %rbx + movl %esi, %esi + movq %r11, %xmm0 + movq %r8, %xmm6 + movl %ecx, %r11d + movabsq $34359738359, %r8 + punpcklqdq %xmm6, %xmm0 + movq %r9, 16(%rbx) + movl %edx, %r9d + movups %xmm0, (%rbx) + movq %r14, %rbx + shrq $3, %rbx + cmpq %r8, %r14 + movl $4294967295, %r8d + movl %edi, %r14d + cmova %r8, %rbx + shrq $32, %rdi + movl %eax, %r8d + shrq $32, %rcx + shrq $32, %rdx + imulq %rbx, %r14 + shrq $32, %rax + imulq %rbx, %rdi + imulq %rbx, %r11 + imulq %rbx, %rcx + shrq $32, %r14 + imulq %rbx, %r9 + shrq $32, %rdi + salq $6, %r14 + imulq %rbx, %rdx + shrq $32, %r11 + salq $6, %rdi + addq %r15, %r14 + imulq %rbx, %r8 + shrq $32, %rcx + salq $6, %r11 + addq %r15, %rdi + shrq $32, %r9 + salq $6, %rcx + addq %r15, %r11 + shrq $32, %rdx + salq $6, %r9 + addq %r15, %rcx + shrq $32, %r8 + salq $6, %rdx + addq %r15, %r9 + salq $6, %r8 + addq %r15, %rdx + addq %r15, %r8 + imulq %rbx, %rax + imulq %rbx, %rsi + xorl %ebx, %ebx + shrq $32, %rax + shrq $32, %rsi + salq $6, %rax + salq $6, %rsi + addq %r15, 
%rax + addq %r15, %rsi +.L1250: + movdqa (%r11,%rbx,8), %xmm2 + movdqa (%r14,%rbx,8), %xmm4 + movdqa (%rdi,%rbx,8), %xmm0 + movdqa %xmm2, %xmm3 + movdqa %xmm4, %xmm1 + pcmpeqd %xmm4, %xmm3 + psubq %xmm2, %xmm1 + pand %xmm3, %xmm1 + movdqa %xmm2, %xmm3 + pcmpgtd %xmm4, %xmm3 + por %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm5 + pand %xmm1, %xmm3 + pandn %xmm2, %xmm5 + pand %xmm1, %xmm2 + por %xmm5, %xmm3 + movdqa %xmm1, %xmm5 + pandn %xmm4, %xmm5 + movdqa %xmm0, %xmm4 + movdqa %xmm3, %xmm1 + pcmpeqd %xmm3, %xmm4 + psubq %xmm0, %xmm1 + por %xmm5, %xmm2 + pand %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pcmpgtd %xmm3, %xmm4 + por %xmm4, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm4 + pand %xmm1, %xmm0 + pandn %xmm3, %xmm4 + movdqa %xmm2, %xmm3 + por %xmm4, %xmm0 + movdqa (%rcx,%rbx,8), %xmm4 + pcmpeqd %xmm0, %xmm3 + movdqa %xmm0, %xmm1 + psubq %xmm2, %xmm1 + pand %xmm3, %xmm1 + movdqa %xmm2, %xmm3 + pcmpgtd %xmm0, %xmm3 + por %xmm3, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm1, %xmm0 + movdqa %xmm4, %xmm1 + pandn %xmm2, %xmm3 + movdqa (%rdx,%rbx,8), %xmm2 + por %xmm3, %xmm0 + movdqa %xmm2, %xmm3 + psubq %xmm2, %xmm1 + movaps %xmm0, 0(%r13,%rbx,8) + movdqa (%r9,%rbx,8), %xmm0 + pcmpeqd %xmm4, %xmm3 + pand %xmm3, %xmm1 + movdqa %xmm2, %xmm3 + pcmpgtd %xmm4, %xmm3 + por %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm5 + pand %xmm1, %xmm3 + pandn %xmm2, %xmm5 + pand %xmm1, %xmm2 + por %xmm5, %xmm3 + movdqa %xmm1, %xmm5 + pandn %xmm4, %xmm5 + movdqa %xmm0, %xmm4 + movdqa %xmm3, %xmm1 + pcmpeqd %xmm3, %xmm4 + psubq %xmm0, %xmm1 + por %xmm5, %xmm2 + pand %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pcmpgtd %xmm3, %xmm4 + por %xmm4, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm4 + pand %xmm1, %xmm0 + pandn %xmm3, %xmm4 + movdqa %xmm2, %xmm3 + por %xmm4, %xmm0 + movdqa (%r8,%rbx,8), %xmm4 + pcmpeqd %xmm0, %xmm3 + movdqa %xmm0, %xmm1 + psubq %xmm2, %xmm1 + pand %xmm3, %xmm1 + movdqa %xmm2, %xmm3 + pcmpgtd %xmm0, %xmm3 + por %xmm3, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm1, %xmm0 + movdqa %xmm4, %xmm1 + pandn %xmm2, %xmm3 + movdqa (%rsi,%rbx,8), %xmm2 + por %xmm3, %xmm0 + movdqa %xmm2, %xmm3 + psubq %xmm2, %xmm1 + movaps %xmm0, 64(%r13,%rbx,8) + movdqa (%rax,%rbx,8), %xmm0 + pcmpeqd %xmm4, %xmm3 + pand %xmm3, %xmm1 + movdqa %xmm2, %xmm3 + pcmpgtd %xmm4, %xmm3 + por %xmm3, %xmm1 + movdqa %xmm4, %xmm3 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm5 + pand %xmm1, %xmm3 + pandn %xmm2, %xmm5 + pand %xmm1, %xmm2 + por %xmm5, %xmm3 + movdqa %xmm1, %xmm5 + pandn %xmm4, %xmm5 + movdqa %xmm0, %xmm4 + movdqa %xmm3, %xmm1 + pcmpeqd %xmm3, %xmm4 + psubq %xmm0, %xmm1 + por %xmm5, %xmm2 + pand %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pcmpgtd %xmm3, %xmm4 + por %xmm4, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm4 + pand %xmm1, %xmm0 + pandn %xmm3, %xmm4 + movdqa %xmm2, %xmm3 + por %xmm4, %xmm0 + pcmpeqd %xmm0, %xmm3 + movdqa %xmm0, %xmm1 + psubq %xmm2, %xmm1 + pand %xmm3, %xmm1 + movdqa %xmm2, %xmm3 + pcmpgtd %xmm0, %xmm3 + por %xmm3, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm1, %xmm0 + pandn %xmm2, %xmm3 + por %xmm3, %xmm0 + movaps %xmm0, 128(%r13,%rbx,8) + addq $2, %rbx + cmpq $8, %rbx + jne .L1250 + movdqa 16(%r13), %xmm0 + movdqa 0(%r13), %xmm1 + movddup 0(%r13), %xmm2 + leaq 192(%r13), %r14 + pxor %xmm2, %xmm1 + pxor %xmm2, %xmm0 + por %xmm1, %xmm0 + movdqa 32(%r13), %xmm1 + pxor %xmm2, %xmm1 + por %xmm1, %xmm0 + movdqa 48(%r13), %xmm1 + pxor %xmm2, 
%xmm1 + por %xmm1, %xmm0 + movdqa 64(%r13), %xmm1 + pxor %xmm2, %xmm1 + por %xmm1, %xmm0 + movdqa 80(%r13), %xmm1 + pxor %xmm2, %xmm1 + por %xmm1, %xmm0 + movdqa 96(%r13), %xmm1 + pxor %xmm2, %xmm1 + por %xmm1, %xmm0 + movdqa 112(%r13), %xmm1 + pxor %xmm2, %xmm1 + por %xmm1, %xmm0 + movdqa 128(%r13), %xmm1 + pxor %xmm2, %xmm1 + por %xmm1, %xmm0 + movdqa 144(%r13), %xmm1 + pxor %xmm2, %xmm1 + por %xmm1, %xmm0 + movdqa 160(%r13), %xmm1 + pxor %xmm2, %xmm1 + por %xmm1, %xmm0 + movdqa 176(%r13), %xmm1 + pxor %xmm2, %xmm1 + por %xmm1, %xmm0 + pxor %xmm1, %xmm1 + pcmpeqd %xmm1, %xmm0 + pshufd $177, %xmm0, %xmm1 + pand %xmm1, %xmm0 + movmskpd %xmm0, %eax + cmpl $3, %eax + je .L1251 + movdqa .LC4(%rip), %xmm0 + movl $2, %esi + movq %r13, %rdi + movups %xmm0, 192(%r13) + movups %xmm0, 208(%r13) + movups %xmm0, 224(%r13) + movups %xmm0, 240(%r13) + movups %xmm0, 256(%r13) + call _ZN3hwy7N_SSSE36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + pcmpeqd %xmm0, %xmm0 + movddup 0(%r13), %xmm2 + movddup 184(%r13), %xmm1 + paddq %xmm1, %xmm0 + pcmpeqd %xmm2, %xmm0 + pshufd $177, %xmm0, %xmm3 + pand %xmm3, %xmm0 + movmskpd %xmm0, %eax + cmpl $3, %eax + jne .L1253 + movq -104(%rbp), %rdi + leaq -64(%rbp), %rdx + movq %r14, %rcx + movdqa %xmm2, %xmm0 + movq -216(%rbp), %rsi + call _ZN3hwy7N_SSSE36detail22MaybePartitionTwoValueINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, %al + jne .L1236 +.L1253: + movq 96(%r13), %rax + cmpq %rax, 88(%r13) + jne .L1349 + cmpq 80(%r13), %rax + jne .L1292 + cmpq 72(%r13), %rax + jne .L1350 + cmpq 64(%r13), %rax + jne .L1351 + cmpq 56(%r13), %rax + jne .L1352 + cmpq 48(%r13), %rax + jne .L1353 + cmpq 40(%r13), %rax + jne .L1354 + cmpq 32(%r13), %rax + jne .L1355 + cmpq 24(%r13), %rax + jne .L1356 + cmpq 16(%r13), %rax + jne .L1357 + cmpq 8(%r13), %rax + jne .L1358 + xorl %ebx, %ebx + movl $1, %edx + cmpq %rax, 0(%r13) + jne .L1295 +.L1294: + movq %rax, %xmm2 + punpcklqdq %xmm2, %xmm2 +.L1461: + movl $1, -268(%rbp) +.L1290: + cmpq $0, -224(%rbp) + je .L1465 + movq -216(%rbp), %rax + movq -104(%rbp), %r12 + movaps %xmm2, -80(%rbp) + leaq -2(%rax), %r15 + movdqu (%r12,%r15,8), %xmm6 + movq %r15, %rbx + movq %r15, %rax + andl $7, %ebx + movaps %xmm6, -256(%rbp) + andl $6, %eax + je .L1359 + movdqu (%r12), %xmm4 + movdqa %xmm2, %xmm1 + movaps %xmm2, -128(%rbp) + movdqa %xmm4, %xmm0 + psubq %xmm4, %xmm1 + movaps %xmm4, -144(%rbp) + pcmpeqd %xmm2, %xmm0 + pand %xmm0, %xmm1 + movdqa %xmm4, %xmm0 + pcmpgtd %xmm2, %xmm0 + por %xmm0, %xmm1 + pcmpeqd %xmm0, %xmm0 + pshufd $245, %xmm1, %xmm1 + pxor %xmm1, %xmm0 + movaps %xmm1, -96(%rbp) + movmskpd %xmm0, %r14d + movq %r14, %rdi + salq $4, %r14 + call __popcountdi2@PLT + movdqa .LC1(%rip), %xmm7 + movdqa .LC0(%rip), %xmm2 + leaq _ZZN3hwy7N_SSSE36detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices(%rip), %rsi + cltq + movdqa -144(%rbp), %xmm4 + movdqa -96(%rbp), %xmm1 + movq %rsi, -184(%rbp) + movq %rax, %xmm6 + movdqa %xmm2, %xmm0 + movaps %xmm2, -240(%rbp) + movdqa -128(%rbp), %xmm2 + movddup %xmm6, %xmm3 + movdqa %xmm7, %xmm6 + movdqa %xmm4, %xmm5 + movaps %xmm7, -208(%rbp) + pcmpeqd %xmm3, %xmm6 + psubq %xmm3, %xmm0 + pshufb (%rsi,%r14), %xmm5 + pcmpgtd %xmm7, %xmm3 + pand %xmm6, %xmm0 + por %xmm3, %xmm0 + pshufd $245, %xmm0, %xmm0 + movq %xmm0, %rdx + testq %rdx, %rdx + je .L1300 + movq %xmm5, (%r12) +.L1300: + movhlps %xmm0, %xmm6 + movq 
%xmm6, %rdx + testq %rdx, %rdx + je .L1301 + movq -104(%rbp), %rsi + movhps %xmm5, 8(%rsi) +.L1301: + movmskpd %xmm1, %ecx + movq -104(%rbp), %rsi + movaps %xmm4, -144(%rbp) + movq %rcx, %rdi + movq %rcx, -96(%rbp) + movaps %xmm2, -128(%rbp) + leaq (%rsi,%rax,8), %r12 + call __popcountdi2@PLT + movq -96(%rbp), %rcx + movdqa -128(%rbp), %xmm2 + movdqa -144(%rbp), %xmm4 + movslq %eax, %r14 + movq -184(%rbp), %rax + salq $4, %rcx + testb $4, %r15b + movdqa %xmm4, %xmm0 + pshufb (%rax,%rcx), %xmm0 + movups %xmm0, 0(%r13) + je .L1302 + movq -104(%rbp), %rsi + movdqa -80(%rbp), %xmm6 + movdqa %xmm2, %xmm1 + movaps %xmm2, -160(%rbp) + movdqu 16(%rsi), %xmm3 + movdqa %xmm6, %xmm0 + pcmpeqd %xmm3, %xmm0 + psubq %xmm3, %xmm1 + movaps %xmm3, -144(%rbp) + pand %xmm0, %xmm1 + movdqa %xmm3, %xmm0 + pcmpgtd %xmm6, %xmm0 + por %xmm0, %xmm1 + pcmpeqd %xmm0, %xmm0 + pshufd $245, %xmm1, %xmm1 + pxor %xmm1, %xmm0 + movaps %xmm1, -128(%rbp) + movmskpd %xmm0, %ecx + movq %rcx, %rdi + movq %rcx, -96(%rbp) + call __popcountdi2@PLT + movq -96(%rbp), %rcx + movdqa -208(%rbp), %xmm7 + cltq + movdqa -240(%rbp), %xmm0 + movdqa -144(%rbp), %xmm3 + movq %rax, %xmm6 + movq -184(%rbp), %rsi + salq $4, %rcx + movdqa -128(%rbp), %xmm1 + movddup %xmm6, %xmm4 + movdqa %xmm7, %xmm6 + movdqa %xmm3, %xmm5 + movdqa -160(%rbp), %xmm2 + pcmpeqd %xmm4, %xmm6 + psubq %xmm4, %xmm0 + pshufb (%rsi,%rcx), %xmm5 + pcmpgtd %xmm7, %xmm4 + pand %xmm6, %xmm0 + por %xmm4, %xmm0 + pshufd $245, %xmm0, %xmm0 + movq %xmm0, %rcx + testq %rcx, %rcx + je .L1303 + movq %xmm5, (%r12) +.L1303: + movhlps %xmm0, %xmm6 + movq %xmm6, %rcx + testq %rcx, %rcx + je .L1304 + movhps %xmm5, 8(%r12) +.L1304: + movmskpd %xmm1, %ecx + movaps %xmm3, -144(%rbp) + leaq (%r12,%rax,8), %r12 + movq %rcx, %rdi + movq %rcx, -96(%rbp) + movaps %xmm2, -128(%rbp) + call __popcountdi2@PLT + movq -96(%rbp), %rcx + movdqa -128(%rbp), %xmm2 + movdqa -144(%rbp), %xmm3 + movq -184(%rbp), %rsi + cltq + salq $4, %rcx + movdqa %xmm3, %xmm0 + pshufb (%rsi,%rcx), %xmm0 + movups %xmm0, 0(%r13,%r14,8) + addq %rax, %r14 + cmpq $5, %rbx + jbe .L1302 + movq -104(%rbp), %rax + movdqa -80(%rbp), %xmm6 + movdqa %xmm2, %xmm1 + movaps %xmm2, -160(%rbp) + movdqu 32(%rax), %xmm3 + movdqa %xmm6, %xmm0 + pcmpeqd %xmm3, %xmm0 + psubq %xmm3, %xmm1 + movaps %xmm3, -144(%rbp) + pand %xmm0, %xmm1 + movdqa %xmm3, %xmm0 + pcmpgtd %xmm6, %xmm0 + por %xmm0, %xmm1 + pcmpeqd %xmm0, %xmm0 + pshufd $245, %xmm1, %xmm1 + pxor %xmm1, %xmm0 + movaps %xmm1, -128(%rbp) + movmskpd %xmm0, %ecx + movq %rcx, %rdi + movq %rcx, -96(%rbp) + call __popcountdi2@PLT + movq -96(%rbp), %rcx + movdqa -208(%rbp), %xmm7 + cltq + movdqa -240(%rbp), %xmm0 + movdqa -144(%rbp), %xmm3 + movq %rax, %xmm5 + movq -184(%rbp), %rsi + salq $4, %rcx + movdqa -128(%rbp), %xmm1 + movddup %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + movdqa %xmm3, %xmm6 + movdqa -160(%rbp), %xmm2 + pcmpeqd %xmm4, %xmm5 + psubq %xmm4, %xmm0 + pshufb (%rsi,%rcx), %xmm6 + pcmpgtd %xmm7, %xmm4 + pand %xmm5, %xmm0 + por %xmm4, %xmm0 + pshufd $245, %xmm0, %xmm0 + movq %xmm0, %rcx + testq %rcx, %rcx + je .L1305 + movq %xmm6, (%r12) +.L1305: + movhlps %xmm0, %xmm5 + movq %xmm5, %rcx + testq %rcx, %rcx + je .L1306 + movhps %xmm6, 8(%r12) +.L1306: + movmskpd %xmm1, %ecx + movaps %xmm3, -144(%rbp) + leaq (%r12,%rax,8), %r12 + movq %rcx, %rdi + movq %rcx, -96(%rbp) + movaps %xmm2, -128(%rbp) + call __popcountdi2@PLT + movq -96(%rbp), %rcx + movdqa -128(%rbp), %xmm2 + movdqa -144(%rbp), %xmm3 + movq -184(%rbp), %rsi + cltq + salq $4, %rcx + movdqa %xmm3, %xmm0 + pshufb (%rsi,%rcx), %xmm0 
+ movups %xmm0, 0(%r13,%r14,8) + addq %rax, %r14 +.L1302: + leaq -2(%rbx), %rax + leaq 1(%rbx), %rcx + andq $-2, %rax + leaq 0(,%r14,8), %r8 + addq $2, %rax + cmpq $2, %rcx + movl $2, %ecx + cmovbe %rcx, %rax +.L1299: + cmpq %rax, %rbx + je .L1307 + subq %rax, %rbx + movdqa -208(%rbp), %xmm4 + movq -104(%rbp), %rsi + movq %r8, -176(%rbp) + movq %rbx, %xmm0 + movdqa -240(%rbp), %xmm1 + movdqa -80(%rbp), %xmm6 + movaps %xmm2, -160(%rbp) + punpcklqdq %xmm0, %xmm0 + movdqa %xmm4, %xmm3 + movdqu (%rsi,%rax,8), %xmm5 + pcmpeqd %xmm0, %xmm3 + psubq %xmm0, %xmm1 + movdqa %xmm6, %xmm7 + pcmpgtd %xmm4, %xmm0 + pcmpeqd %xmm5, %xmm7 + movaps %xmm5, -144(%rbp) + pand %xmm3, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm4 + movdqa %xmm2, %xmm0 + psubq %xmm5, %xmm0 + movaps %xmm4, -96(%rbp) + pand %xmm7, %xmm0 + movdqa %xmm5, %xmm7 + pcmpgtd %xmm6, %xmm7 + por %xmm7, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm6 + movaps %xmm0, -128(%rbp) + pandn %xmm4, %xmm6 + movmskpd %xmm6, %ebx + movq %rbx, %rdi + salq $4, %rbx + call __popcountdi2@PLT + movdqa -208(%rbp), %xmm4 + movdqa -240(%rbp), %xmm1 + cltq + movdqa -144(%rbp), %xmm5 + movq -184(%rbp), %rsi + movq %rax, %xmm6 + movdqa -128(%rbp), %xmm0 + movdqa -160(%rbp), %xmm2 + movddup %xmm6, %xmm3 + movdqa %xmm4, %xmm6 + movdqa %xmm5, %xmm7 + movq -176(%rbp), %r8 + pcmpeqd %xmm3, %xmm6 + psubq %xmm3, %xmm1 + pshufb (%rsi,%rbx), %xmm7 + pcmpgtd %xmm4, %xmm3 + movdqa -96(%rbp), %xmm4 + pand %xmm6, %xmm1 + por %xmm3, %xmm1 + pshufd $245, %xmm1, %xmm1 + movq %xmm1, %rcx + testq %rcx, %rcx + je .L1308 + movq %xmm7, (%r12) +.L1308: + movhlps %xmm1, %xmm6 + movq %xmm6, %rcx + testq %rcx, %rcx + je .L1309 + movhps %xmm7, 8(%r12) +.L1309: + pand %xmm4, %xmm0 + movq %r8, -128(%rbp) + leaq (%r12,%rax,8), %r12 + movmskpd %xmm0, %ebx + movaps %xmm5, -144(%rbp) + movq %rbx, %rdi + movaps %xmm2, -96(%rbp) + salq $4, %rbx + call __popcountdi2@PLT + movq -184(%rbp), %rsi + movq -128(%rbp), %r8 + movdqa -144(%rbp), %xmm5 + cltq + movdqa -96(%rbp), %xmm2 + addq %rax, %r14 + movdqa %xmm5, %xmm0 + pshufb (%rsi,%rbx), %xmm0 + movups %xmm0, 0(%r13,%r8) + leaq 0(,%r14,8), %r8 +.L1307: + movq -104(%rbp), %rax + movq %r15, %r9 + subq %r14, %r9 + leaq (%rax,%r9,8), %rax + cmpl $8, %r8d + jnb .L1310 + testl %r8d, %r8d + jne .L1466 +.L1311: + cmpl $8, %r8d + jnb .L1314 +.L1470: + testl %r8d, %r8d + jne .L1467 +.L1315: + movq %r12, %rax + subq -104(%rbp), %rax + sarq $3, %rax + subq %rax, %r15 + subq %rax, %r9 + movq %rax, -288(%rbp) + movq %r15, -280(%rbp) + movq %r9, %r14 + leaq (%r12,%r9,8), %rax + je .L1360 + movdqu (%r12), %xmm6 + leaq 64(%r12), %rcx + leaq -64(%rax), %rsi + movaps %xmm6, -304(%rbp) + movdqu 16(%r12), %xmm6 + movaps %xmm6, -320(%rbp) + movdqu 32(%r12), %xmm6 + movaps %xmm6, -336(%rbp) + movdqu 48(%r12), %xmm6 + movaps %xmm6, -352(%rbp) + movdqu -64(%rax), %xmm6 + movaps %xmm6, -368(%rbp) + movdqu -48(%rax), %xmm6 + movaps %xmm6, -384(%rbp) + movdqu -32(%rax), %xmm6 + movaps %xmm6, -400(%rbp) + movdqu -16(%rax), %xmm6 + movaps %xmm6, -416(%rbp) + cmpq %rsi, %rcx + je .L1361 + movq %r13, -424(%rbp) + xorl %ebx, %ebx + movl $2, %r15d + movq %rsi, %r13 + movaps %xmm2, -96(%rbp) + jmp .L1322 + .p2align 4,,10 + .p2align 3 +.L1469: + movdqu -64(%r13), %xmm5 + movdqu -48(%r13), %xmm4 + prefetcht0 -256(%r13) + subq $64, %r13 + movdqu 32(%r13), %xmm3 + movdqu 48(%r13), %xmm1 +.L1321: + movdqa -80(%rbp), %xmm7 + movdqa -96(%rbp), %xmm0 + movq %rcx, -112(%rbp) + movaps %xmm1, -176(%rbp) + movdqa %xmm7, %xmm2 + psubq %xmm5, %xmm0 + movaps %xmm3, -160(%rbp) + 
pcmpeqd %xmm5, %xmm2 + movaps %xmm4, -144(%rbp) + pand %xmm2, %xmm0 + movdqa %xmm5, %xmm2 + pcmpgtd %xmm7, %xmm2 + pshufd $78, %xmm5, %xmm7 + por %xmm2, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm6 + movdqa %xmm0, %xmm2 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm6 + punpckhqdq %xmm0, %xmm2 + pandn %xmm6, %xmm2 + movdqa %xmm2, %xmm6 + pand %xmm7, %xmm2 + pandn %xmm5, %xmm6 + por %xmm6, %xmm2 + movaps %xmm2, -128(%rbp) + call __popcountdi2@PLT + movdqa -128(%rbp), %xmm2 + movdqa -80(%rbp), %xmm7 + leaq -2(%r14,%rbx), %rdx + movdqa -144(%rbp), %xmm4 + movdqa -96(%rbp), %xmm0 + cltq + movups %xmm2, (%r12,%rbx,8) + addq $2, %rbx + movups %xmm2, (%r12,%rdx,8) + movdqa %xmm7, %xmm2 + psubq %xmm4, %xmm0 + pshufd $78, %xmm4, %xmm6 + pcmpeqd %xmm4, %xmm2 + subq %rax, %rbx + pand %xmm2, %xmm0 + movdqa %xmm4, %xmm2 + pcmpgtd %xmm7, %xmm2 + por %xmm2, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm5 + movdqa %xmm0, %xmm2 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm5 + punpckhqdq %xmm0, %xmm2 + pandn %xmm5, %xmm2 + movdqa %xmm2, %xmm5 + pand %xmm6, %xmm2 + pandn %xmm4, %xmm5 + por %xmm5, %xmm2 + movaps %xmm2, -128(%rbp) + call __popcountdi2@PLT + movdqa -128(%rbp), %xmm2 + movdqa -80(%rbp), %xmm7 + leaq -4(%r14,%rbx), %rdx + movdqa -160(%rbp), %xmm3 + movdqa -96(%rbp), %xmm0 + cltq + movups %xmm2, (%r12,%rbx,8) + movups %xmm2, (%r12,%rdx,8) + movdqa %xmm7, %xmm2 + psubq %xmm3, %xmm0 + movq %r15, %rdx + pcmpeqd %xmm3, %xmm2 + pshufd $78, %xmm3, %xmm5 + subq %rax, %rdx + addq %rdx, %rbx + pand %xmm2, %xmm0 + movdqa %xmm3, %xmm2 + pcmpgtd %xmm7, %xmm2 + por %xmm2, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm4 + movdqa %xmm0, %xmm2 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm4 + punpckhqdq %xmm0, %xmm2 + pandn %xmm4, %xmm2 + movdqa %xmm2, %xmm4 + pand %xmm5, %xmm2 + pandn %xmm3, %xmm4 + por %xmm4, %xmm2 + movaps %xmm2, -128(%rbp) + call __popcountdi2@PLT + movdqa -128(%rbp), %xmm2 + movdqa -80(%rbp), %xmm7 + movq %r15, %rdx + movdqa -176(%rbp), %xmm1 + leaq -6(%r14,%rbx), %rdi + movdqa -96(%rbp), %xmm0 + cltq + movups %xmm2, (%r12,%rbx,8) + subq %rax, %rdx + subq $8, %r14 + movups %xmm2, (%r12,%rdi,8) + movdqa %xmm7, %xmm2 + psubq %xmm1, %xmm0 + pshufd $78, %xmm1, %xmm4 + pcmpeqd %xmm1, %xmm2 + addq %rdx, %rbx + pand %xmm2, %xmm0 + movdqa %xmm1, %xmm2 + pcmpgtd %xmm7, %xmm2 + por %xmm2, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm2 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm3 + punpckhqdq %xmm0, %xmm2 + pandn %xmm3, %xmm2 + movdqa %xmm2, %xmm3 + pandn %xmm1, %xmm3 + movdqa %xmm2, %xmm1 + pand %xmm4, %xmm1 + por %xmm3, %xmm1 + movaps %xmm1, -128(%rbp) + call __popcountdi2@PLT + movdqa -128(%rbp), %xmm1 + leaq (%rbx,%r14), %rdx + movq -112(%rbp), %rcx + cltq + movups %xmm1, (%r12,%rbx,8) + movups %xmm1, (%r12,%rdx,8) + movq %r15, %rdx + subq %rax, %rdx + addq %rdx, %rbx + cmpq %r13, %rcx + je .L1468 +.L1322: + movq %rcx, %rax + subq %r12, %rax + sarq $3, %rax + subq %rbx, %rax + cmpq $8, %rax + ja .L1469 + movdqu (%rcx), %xmm5 + movdqu 16(%rcx), %xmm4 + prefetcht0 256(%rcx) + addq $64, %rcx + movdqu -32(%rcx), %xmm3 + movdqu -16(%rcx), %xmm1 + jmp .L1321 + .p2align 4,,10 + .p2align 3 +.L1296: + cmpq $23, %rdx + je .L1460 +.L1295: + movq %rdx, %rcx + addq $1, %rdx + cmpq %rax, 0(%r13,%rdx,8) + je .L1296 + movl $12, %edx + subq $11, %rcx + subq %rbx, %rdx + cmpq %rdx, %rcx + jb .L1294 +.L1460: + movq 0(%r13,%rbx,8), %rax + jmp .L1294 + .p2align 4,,10 + .p2align 3 +.L1464: + movq -216(%rbp), %rsi + movl $8, %edx + subq %rax, %rdx + leaq 
-8(%rax,%rsi), %r14 + leaq (%rdi,%rdx,8), %r15 + movq %r8, %rax + jmp .L1248 + .p2align 4,,10 + .p2align 3 +.L1468: + movdqa -96(%rbp), %xmm2 + movq -424(%rbp), %r13 + leaq (%r12,%rbx,8), %rcx + leaq (%r14,%rbx), %r15 + addq $2, %rbx +.L1319: + movdqa -304(%rbp), %xmm7 + movdqa -80(%rbp), %xmm6 + movdqa %xmm2, %xmm0 + movq %rcx, -144(%rbp) + movaps %xmm2, -128(%rbp) + movdqa %xmm7, %xmm1 + psubq %xmm7, %xmm0 + pshufd $78, %xmm7, %xmm4 + pcmpeqd %xmm6, %xmm1 + pand %xmm1, %xmm0 + movdqa %xmm7, %xmm1 + pcmpgtd %xmm6, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm1 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm3 + punpckhqdq %xmm0, %xmm1 + pandn %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm4, %xmm1 + pandn %xmm7, %xmm3 + por %xmm3, %xmm1 + movaps %xmm1, -96(%rbp) + call __popcountdi2@PLT + movdqa -96(%rbp), %xmm1 + movdqa -80(%rbp), %xmm6 + movdqa -320(%rbp), %xmm7 + movq -144(%rbp), %rcx + cltq + movdqa -128(%rbp), %xmm2 + subq %rax, %rbx + movups %xmm1, (%rcx) + pshufd $78, %xmm7, %xmm4 + movups %xmm1, -16(%r12,%r15,8) + movdqa %xmm7, %xmm1 + movdqa %xmm2, %xmm0 + pcmpeqd %xmm6, %xmm1 + psubq %xmm7, %xmm0 + pand %xmm1, %xmm0 + movdqa %xmm7, %xmm1 + pcmpgtd %xmm6, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm1 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm3 + punpckhqdq %xmm0, %xmm1 + pandn %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm4, %xmm1 + pandn %xmm7, %xmm3 + por %xmm3, %xmm1 + movaps %xmm1, -96(%rbp) + call __popcountdi2@PLT + movdqa -96(%rbp), %xmm1 + movdqa -80(%rbp), %xmm6 + leaq -4(%r14,%rbx), %rcx + movdqa -336(%rbp), %xmm7 + movdqa -128(%rbp), %xmm2 + cltq + movups %xmm1, (%r12,%rbx,8) + subq %rax, %rbx + movups %xmm1, (%r12,%rcx,8) + movdqa %xmm7, %xmm1 + movdqa %xmm2, %xmm0 + leaq 2(%rbx), %r15 + pcmpeqd %xmm6, %xmm1 + psubq %xmm7, %xmm0 + pshufd $78, %xmm7, %xmm4 + movl $2, %ebx + pand %xmm1, %xmm0 + movdqa %xmm7, %xmm1 + pcmpgtd %xmm6, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm1 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm3 + punpckhqdq %xmm0, %xmm1 + pandn %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm4, %xmm1 + pandn %xmm7, %xmm3 + por %xmm3, %xmm1 + movaps %xmm1, -96(%rbp) + call __popcountdi2@PLT + movdqa -96(%rbp), %xmm1 + movdqa -80(%rbp), %xmm6 + leaq -6(%r14,%r15), %rcx + movdqa -352(%rbp), %xmm7 + movdqa -128(%rbp), %xmm2 + cltq + movups %xmm1, (%r12,%r15,8) + movups %xmm1, (%r12,%rcx,8) + movdqa %xmm7, %xmm1 + movdqa %xmm2, %xmm0 + movq %rbx, %rcx + pcmpeqd %xmm6, %xmm1 + psubq %xmm7, %xmm0 + pshufd $78, %xmm7, %xmm4 + subq %rax, %rcx + addq %rcx, %r15 + pand %xmm1, %xmm0 + movdqa %xmm7, %xmm1 + pcmpgtd %xmm6, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm1 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm3 + punpckhqdq %xmm0, %xmm1 + pandn %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm4, %xmm1 + pandn %xmm7, %xmm3 + por %xmm3, %xmm1 + movaps %xmm1, -96(%rbp) + call __popcountdi2@PLT + movdqa -96(%rbp), %xmm1 + movdqa -80(%rbp), %xmm6 + leaq -8(%r14,%r15), %rcx + movdqa -368(%rbp), %xmm7 + movdqa -128(%rbp), %xmm2 + cltq + movups %xmm1, (%r12,%r15,8) + movups %xmm1, (%r12,%rcx,8) + movdqa %xmm7, %xmm1 + movdqa %xmm2, %xmm0 + movq %rbx, %rcx + pcmpeqd %xmm6, %xmm1 + psubq %xmm7, %xmm0 + pshufd $78, %xmm7, %xmm4 + subq %rax, %rcx + addq %rcx, %r15 + pand %xmm1, %xmm0 + movdqa %xmm7, %xmm1 + pcmpgtd %xmm6, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + 
movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm1 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm3 + punpckhqdq %xmm0, %xmm1 + pandn %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm4, %xmm1 + pandn %xmm7, %xmm3 + por %xmm3, %xmm1 + movaps %xmm1, -96(%rbp) + call __popcountdi2@PLT + movdqa -96(%rbp), %xmm1 + movdqa -80(%rbp), %xmm6 + leaq -10(%r14,%r15), %rcx + movdqa -384(%rbp), %xmm7 + movdqa -128(%rbp), %xmm2 + cltq + movups %xmm1, (%r12,%r15,8) + movups %xmm1, (%r12,%rcx,8) + movdqa %xmm7, %xmm1 + movdqa %xmm2, %xmm0 + movq %rbx, %rcx + pcmpeqd %xmm6, %xmm1 + psubq %xmm7, %xmm0 + pshufd $78, %xmm7, %xmm4 + subq %rax, %rcx + addq %rcx, %r15 + pand %xmm1, %xmm0 + movdqa %xmm7, %xmm1 + pcmpgtd %xmm6, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm1 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm3 + punpckhqdq %xmm0, %xmm1 + pandn %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm4, %xmm1 + pandn %xmm7, %xmm3 + por %xmm3, %xmm1 + movaps %xmm1, -96(%rbp) + call __popcountdi2@PLT + movdqa -96(%rbp), %xmm1 + leaq -12(%r14,%r15), %rcx + cltq + movups %xmm1, (%r12,%r15,8) + movups %xmm1, (%r12,%rcx,8) + movdqa -128(%rbp), %xmm2 + movdqa -80(%rbp), %xmm6 + movq %rbx, %rcx + movdqa -400(%rbp), %xmm7 + subq %rax, %rcx + movdqa %xmm2, %xmm0 + addq %rcx, %r15 + movdqa %xmm7, %xmm1 + psubq %xmm7, %xmm0 + pshufd $78, %xmm7, %xmm4 + pcmpeqd %xmm6, %xmm1 + pand %xmm1, %xmm0 + movdqa %xmm7, %xmm1 + pcmpgtd %xmm6, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm1 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm3 + punpckhqdq %xmm0, %xmm1 + pandn %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm4, %xmm1 + pandn %xmm7, %xmm3 + por %xmm3, %xmm1 + movaps %xmm1, -96(%rbp) + call __popcountdi2@PLT + movdqa -96(%rbp), %xmm1 + leaq -14(%r14,%r15), %rcx + movdqa -416(%rbp), %xmm7 + movdqa -80(%rbp), %xmm6 + movdqa -128(%rbp), %xmm2 + cltq + movups %xmm1, (%r12,%r15,8) + pshufd $78, %xmm7, %xmm4 + movups %xmm1, (%r12,%rcx,8) + movdqa %xmm7, %xmm1 + movdqa %xmm2, %xmm0 + movq %rbx, %rcx + pcmpeqd %xmm6, %xmm1 + psubq %xmm7, %xmm0 + subq %rax, %rcx + addq %rcx, %r15 + pand %xmm1, %xmm0 + movdqa %xmm7, %xmm1 + pcmpgtd %xmm6, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm1 + movmskpd %xmm0, %edi + punpcklqdq %xmm0, %xmm3 + punpckhqdq %xmm0, %xmm1 + pandn %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + pand %xmm4, %xmm1 + pandn %xmm7, %xmm3 + por %xmm3, %xmm1 + movaps %xmm1, -96(%rbp) + call __popcountdi2@PLT + movdqa -96(%rbp), %xmm1 + leaq -16(%r14,%r15), %rcx + movdqa -128(%rbp), %xmm2 + cltq + movups %xmm1, (%r12,%r15,8) + subq %rax, %rbx + movups %xmm1, (%r12,%rcx,8) + movq -280(%rbp), %rcx + leaq (%rbx,%r15), %r14 + leaq 0(,%r14,8), %r15 + subq %r14, %rcx +.L1318: + movq -280(%rbp), %rsi + cmpq $2, %rcx + movdqa -80(%rbp), %xmm7 + leaq -16(,%rsi,8), %rax + cmovnb %r15, %rax + movdqu (%r12,%rax), %xmm6 + movups %xmm6, (%r12,%rsi,8) + movaps %xmm6, -96(%rbp) + movdqa -256(%rbp), %xmm6 + movdqa %xmm6, %xmm0 + psubq %xmm6, %xmm2 + pcmpeqd %xmm7, %xmm0 + movdqa %xmm2, %xmm1 + pand %xmm0, %xmm1 + movdqa %xmm6, %xmm0 + pcmpgtd %xmm7, %xmm0 + por %xmm0, %xmm1 + pcmpeqd %xmm0, %xmm0 + pshufd $245, %xmm1, %xmm1 + pxor %xmm1, %xmm0 + movaps %xmm1, -80(%rbp) + movmskpd %xmm0, %ebx + movq %rbx, %rdi + salq $4, %rbx + call __popcountdi2@PLT + movdqa -240(%rbp), %xmm0 + movdqa -80(%rbp), %xmm1 + cltq + movq -184(%rbp), %rsi + movdqa -256(%rbp), %xmm4 + movq %rax, %xmm6 + movddup %xmm6, %xmm2 + movdqa -208(%rbp), 
%xmm6 + pshufb (%rsi,%rbx), %xmm4 + psubq %xmm2, %xmm0 + movdqa %xmm6, %xmm3 + pcmpeqd %xmm2, %xmm3 + pcmpgtd %xmm6, %xmm2 + pand %xmm3, %xmm0 + por %xmm2, %xmm0 + pshufd $245, %xmm0, %xmm0 + movq %xmm0, %rcx + testq %rcx, %rcx + je .L1324 + movq %xmm4, (%r12,%r15) +.L1324: + movhlps %xmm0, %xmm6 + movq %xmm6, %rcx + testq %rcx, %rcx + je .L1325 + movhps %xmm4, 8(%r12,%r15) +.L1325: + movmskpd %xmm1, %r15d + addq %rax, %r14 + movq %r15, %rdi + salq $4, %r15 + leaq 0(,%r14,8), %rbx + call __popcountdi2@PLT + movdqa -208(%rbp), %xmm6 + movdqa -240(%rbp), %xmm0 + cltq + movq -184(%rbp), %rsi + movdqa -256(%rbp), %xmm2 + movq %rax, %xmm1 + movdqa %xmm6, %xmm3 + punpcklqdq %xmm1, %xmm1 + pshufb (%rsi,%r15), %xmm2 + pcmpeqd %xmm1, %xmm3 + psubq %xmm1, %xmm0 + pcmpgtd %xmm6, %xmm1 + pand %xmm3, %xmm0 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movq %xmm0, %rax + testq %rax, %rax + je .L1326 + movq %xmm2, (%r12,%r14,8) +.L1326: + movhlps %xmm0, %xmm6 + movq %xmm6, %rax + testq %rax, %rax + je .L1327 + movhps %xmm2, 8(%r12,%rbx) +.L1327: + movq -288(%rbp), %rbx + addq %r14, %rbx + movq -224(%rbp), %r14 + subq $1, %r14 + cmpl $2, -268(%rbp) + je .L1329 + movq -192(%rbp), %r8 + movq %r14, %r9 + movq %r13, %rcx + movq %rbx, %rdx + movq -264(%rbp), %rsi + movq -104(%rbp), %rdi + call _ZN3hwy7N_SSSE36detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + cmpl $3, -268(%rbp) + je .L1236 +.L1329: + movq -216(%rbp), %rdx + movq -104(%rbp), %rax + movq %r14, %r9 + movq %r13, %rcx + movq -192(%rbp), %r8 + movq -264(%rbp), %rsi + subq %rbx, %rdx + leaq (%rax,%rbx,8), %rdi + call _ZN3hwy7N_SSSE36detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L1236: + addq $392, %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L1310: + .cfi_restore_state + movq (%rax), %rcx + leaq 8(%r12), %rdi + andq $-8, %rdi + movq %rcx, (%r12) + movl %r8d, %ecx + movq -8(%rax,%rcx), %rsi + movq %rsi, -8(%r12,%rcx) + movq %r12, %rcx + movq %rax, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %r8d, %ecx + shrl $3, %ecx + rep movsq + cmpl $8, %r8d + jb .L1470 +.L1314: + movq 0(%r13), %rcx + leaq 8(%rax), %rdi + andq $-8, %rdi + movq %rcx, (%rax) + movl %r8d, %ecx + movq -8(%r13,%rcx), %rsi + movq %rsi, -8(%rax,%rcx) + subq %rdi, %rax + movq %r13, %rsi + leal (%r8,%rax), %ecx + subq %rax, %rsi + shrl $3, %ecx + rep movsq + jmp .L1315 +.L1467: + movzbl 0(%r13), %ecx + movb %cl, (%rax) + jmp .L1315 +.L1466: + movzbl (%rax), %ecx + movb %cl, (%r12) + jmp .L1311 +.L1463: + cmpq $1, %rdx + jbe .L1236 + leaq 256(%rdi), %rax + cmpq %rax, %rsi + jb .L1240 + movl $2, %esi + call _ZN3hwy7N_SSSE36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + jmp .L1236 +.L1251: + movq %r10, %rax + movl $2, %edi + movdqa .LC1(%rip), %xmm7 + movdqa %xmm2, %xmm5 + andl $1, %eax + subq %rax, %rdi + movq -104(%rbp), %rax + movaps %xmm7, -208(%rbp) + movdqu (%rax), %xmm6 + movaps %xmm6, -80(%rbp) + movq %rdi, %xmm6 + movdqa -80(%rbp), %xmm0 + movddup %xmm6, %xmm3 + movdqa .LC0(%rip), %xmm6 + pcmpeqd %xmm2, %xmm0 + movdqa %xmm6, %xmm1 + movaps %xmm6, -240(%rbp) + movdqa %xmm7, %xmm6 + pcmpeqd %xmm3, %xmm6 + psubq %xmm3, %xmm1 + pcmpgtd %xmm7, %xmm3 + pshufd $177, %xmm0, %xmm4 + pand %xmm4, %xmm0 + pand %xmm6, %xmm1 + por %xmm3, %xmm1 + pshufd 
$245, %xmm1, %xmm1 + pandn %xmm1, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + jne .L1471 + movq -104(%rbp), %rax + pxor %xmm1, %xmm1 + movq -216(%rbp), %r8 + pxor %xmm6, %xmm6 + movdqa %xmm1, %xmm0 + leaq 256(%rax,%rdi,8), %rsi + .p2align 4,,10 + .p2align 3 +.L1257: + movq %rdi, %rcx + leaq 32(%rdi), %rdi + cmpq %rdi, %r8 + jb .L1472 + leaq -256(%rsi), %rax +.L1256: + movdqa (%rax), %xmm3 + leaq 32(%rax), %rdx + pxor %xmm2, %xmm3 + por %xmm3, %xmm0 + movdqa 16(%rax), %xmm3 + pxor %xmm2, %xmm3 + por %xmm3, %xmm1 + movdqa 32(%rax), %xmm3 + pxor %xmm2, %xmm3 + por %xmm3, %xmm0 + movdqa 48(%rax), %xmm3 + pxor %xmm2, %xmm3 + por %xmm3, %xmm1 + movdqa 64(%rax), %xmm3 + pxor %xmm2, %xmm3 + por %xmm3, %xmm0 + movdqa 80(%rax), %xmm3 + leaq 96(%rdx), %rax + pxor %xmm2, %xmm3 + por %xmm3, %xmm1 + movdqa 64(%rdx), %xmm3 + pxor %xmm2, %xmm3 + por %xmm3, %xmm0 + movdqa 80(%rdx), %xmm3 + pxor %xmm2, %xmm3 + por %xmm3, %xmm1 + cmpq %rsi, %rax + jne .L1256 + movdqa %xmm0, %xmm3 + leaq 352(%rdx), %rsi + por %xmm1, %xmm3 + pcmpeqd %xmm6, %xmm3 + pshufd $177, %xmm3, %xmm4 + pand %xmm4, %xmm3 + movmskpd %xmm3, %eax + cmpl $3, %eax + je .L1257 + movq -104(%rbp), %rax + movdqa %xmm5, %xmm0 + pcmpeqd %xmm3, %xmm3 + movq -104(%rbp), %rdx + pcmpeqd (%rax,%rcx,8), %xmm0 + pshufd $177, %xmm0, %xmm1 + pand %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + jne .L1259 + .p2align 4,,10 + .p2align 3 +.L1258: + addq $2, %rcx + movdqa %xmm5, %xmm0 + pcmpeqd (%rdx,%rcx,8), %xmm0 + pshufd $177, %xmm0, %xmm1 + pand %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + je .L1258 +.L1259: + rep bsfl %eax, %eax + cltq + addq %rcx, %rax +.L1255: + movq -104(%rbp), %rsi + movdqa %xmm5, %xmm3 + movdqa %xmm2, %xmm0 + leaq (%rsi,%rax,8), %rdi + movq (%rdi), %rbx + movq %rbx, %xmm6 + movddup %xmm6, %xmm1 + pcmpeqd %xmm1, %xmm3 + psubq %xmm1, %xmm0 + movaps %xmm1, -96(%rbp) + pand %xmm3, %xmm0 + movdqa %xmm1, %xmm3 + pcmpgtd %xmm5, %xmm3 + por %xmm3, %xmm0 + pshufd $245, %xmm0, %xmm0 + movmskpd %xmm0, %edx + testl %edx, %edx + jne .L1264 + movq -216(%rbp), %rax + xorl %r15d, %r15d + movaps %xmm5, -80(%rbp) + movq %r12, -176(%rbp) + leaq -2(%rax), %r14 + movq %rax, %r12 + movq %r13, %rax + movq %rbx, -144(%rbp) + movq %r15, %r13 + movaps %xmm1, -160(%rbp) + movq %rsi, %rbx + movq %rax, %r15 + movaps %xmm2, -128(%rbp) + jmp .L1271 + .p2align 4,,10 + .p2align 3 +.L1265: + movdqa -128(%rbp), %xmm6 + movmskpd %xmm0, %edi + movups %xmm6, (%rbx,%r14,8) + call __popcountdi2@PLT + cltq + addq %rax, %r13 + leaq -2(%r14), %rax + cmpq %rax, %r12 + jbe .L1473 + movq %rax, %r14 +.L1271: + movdqu (%rbx,%r14,8), %xmm0 + pcmpeqd -96(%rbp), %xmm0 + pshufd $177, %xmm0, %xmm1 + movdqa %xmm0, %xmm3 + movdqu (%rbx,%r14,8), %xmm0 + pand %xmm1, %xmm3 + pcmpeqd -80(%rbp), %xmm0 + pshufd $177, %xmm0, %xmm1 + pand %xmm1, %xmm0 + movdqa %xmm3, %xmm1 + por %xmm0, %xmm1 + movmskpd %xmm1, %eax + cmpl $3, %eax + je .L1265 + pcmpeqd %xmm4, %xmm4 + movq %r15, %rax + movq %r13, %r15 + movq -104(%rbp), %rcx + pxor %xmm4, %xmm0 + movq %rax, %r13 + leaq 2(%r14), %rdx + movdqa -80(%rbp), %xmm5 + pandn %xmm0, %xmm3 + movq -144(%rbp), %rbx + movdqa -128(%rbp), %xmm2 + movmskpd %xmm3, %eax + movdqa -160(%rbp), %xmm1 + rep bsfl %eax, %eax + cltq + addq %r14, %rax + addq $4, %r14 + movddup (%rcx,%rax,8), %xmm3 + movq -216(%rbp), %rax + movaps %xmm3, -64(%rbp) + subq %r15, %rax + cmpq %r14, %rax + jb .L1266 + .p2align 4,,10 + .p2align 3 +.L1267: + movups %xmm1, -16(%rcx,%r14,8) + movq %r14, %rdx + addq $2, %r14 + cmpq %rax, 
%r14 + jbe .L1267 +.L1266: + movdqa -208(%rbp), %xmm7 + subq %rdx, %rax + movdqa -240(%rbp), %xmm4 + leaq 0(,%rdx,8), %rcx + movq %rax, %xmm0 + punpcklqdq %xmm0, %xmm0 + movdqa %xmm7, %xmm6 + pcmpeqd %xmm0, %xmm6 + psubq %xmm0, %xmm4 + pcmpgtd %xmm7, %xmm0 + pand %xmm6, %xmm4 + por %xmm4, %xmm0 + pshufd $245, %xmm0, %xmm0 + movq %xmm0, %rax + testq %rax, %rax + je .L1275 + movq -104(%rbp), %rax + movq %rbx, (%rax,%rdx,8) +.L1275: + movhlps %xmm0, %xmm6 + movq %xmm6, %rax + testq %rax, %rax + je .L1270 + movq -104(%rbp), %rax + movq %rbx, 8(%rax,%rcx) +.L1270: + movdqa %xmm5, %xmm0 + pcmpeqd .LC5(%rip), %xmm0 + pshufd $177, %xmm0, %xmm4 + pand %xmm4, %xmm0 + movmskpd %xmm0, %eax + cmpl $3, %eax + je .L1347 + movdqa %xmm5, %xmm0 + pcmpeqd .LC6(%rip), %xmm0 + pshufd $177, %xmm0, %xmm4 + pand %xmm4, %xmm0 + movmskpd %xmm0, %eax + movl %eax, -268(%rbp) + cmpl $3, %eax + je .L1474 + movdqa %xmm1, %xmm4 + movdqa %xmm1, %xmm0 + movdqa %xmm1, %xmm6 + pcmpeqd %xmm3, %xmm4 + psubq %xmm3, %xmm0 + pand %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pcmpgtd %xmm1, %xmm4 + por %xmm4, %xmm0 + pshufd $245, %xmm0, %xmm0 + movdqa %xmm0, %xmm4 + pand %xmm0, %xmm6 + pandn %xmm3, %xmm4 + por %xmm4, %xmm6 + movdqa %xmm6, %xmm7 + movdqa %xmm6, %xmm4 + pcmpeqd %xmm5, %xmm7 + psubq %xmm2, %xmm4 + pand %xmm7, %xmm4 + movdqa %xmm5, %xmm7 + pcmpgtd %xmm6, %xmm7 + por %xmm7, %xmm4 + pshufd $245, %xmm4, %xmm4 + movmskpd %xmm4, %eax + testl %eax, %eax + jne .L1475 + movdqa %xmm2, %xmm0 + movl $32, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L1281: + movq -104(%rbp), %rdi + leaq (%rcx,%rax,2), %rdx + movdqa %xmm0, %xmm1 + addq $1, %rax + movdqu (%rdi,%rdx,8), %xmm3 + movdqa %xmm3, %xmm4 + psubq %xmm3, %xmm1 + pcmpeqd %xmm0, %xmm4 + pand %xmm4, %xmm1 + movdqa %xmm3, %xmm4 + pcmpgtd %xmm0, %xmm4 + por %xmm4, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm4 + pand %xmm0, %xmm1 + pandn %xmm3, %xmm4 + por %xmm4, %xmm1 + movdqa %xmm1, %xmm0 + cmpq $16, %rax + jne .L1281 + movdqa %xmm1, %xmm3 + psubq %xmm2, %xmm1 + pcmpeqd %xmm5, %xmm3 + pand %xmm3, %xmm1 + movdqa %xmm5, %xmm3 + pcmpgtd %xmm0, %xmm3 + por %xmm3, %xmm1 + pshufd $245, %xmm1, %xmm1 + movmskpd %xmm1, %eax + testl %eax, %eax + jne .L1461 + leaq 32(%rsi), %rax + cmpq %rax, -216(%rbp) + jb .L1476 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L1281 +.L1359: + movdqa .LC0(%rip), %xmm6 + leaq _ZZN3hwy7N_SSSE36detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices(%rip), %rsi + xorl %r8d, %r8d + xorl %r14d, %r14d + movq %rsi, -184(%rbp) + movaps %xmm6, -240(%rbp) + movdqa .LC1(%rip), %xmm6 + movaps %xmm6, -208(%rbp) + jmp .L1299 +.L1361: + movq %r9, %r15 + movq %r12, %rcx + movl $2, %ebx + jmp .L1319 +.L1360: + movq %r15, %rcx + xorl %r15d, %r15d + jmp .L1318 +.L1465: + movq -216(%rbp), %rsi + movq -104(%rbp), %rdi + leaq -1(%rsi), %rbx + movq %rbx, %r13 + shrq %r13 + .p2align 4,,10 + .p2align 3 +.L1297: + movq %r13, %rdx + call _ZN3hwy7N_SSSE36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0 + subq $1, %r13 + jnb .L1297 + .p2align 4,,10 + .p2align 3 +.L1298: + movq (%rdi,%rbx,8), %rdx + movq (%rdi), %rax + movq %rbx, %rsi + movq %rdx, (%rdi) + xorl %edx, %edx + movq %rax, (%rdi,%rbx,8) + call _ZN3hwy7N_SSSE36detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0 + subq $1, %rbx + jne .L1298 + jmp .L1236 +.L1349: + movl $12, %edx + movl $11, %ebx + jmp .L1295 +.L1350: + movl $9, %ebx + movl $10, %edx + 
jmp .L1295 +.L1351: + movl $9, %edx + jmp .L1295 +.L1352: + movl $8, %edx + movl $7, %ebx + jmp .L1295 +.L1353: + movl $6, %ebx + movl $7, %edx + jmp .L1295 +.L1472: + movq -104(%rbp), %rdi + movq -216(%rbp), %rsi + pcmpeqd %xmm3, %xmm3 + .p2align 4,,10 + .p2align 3 +.L1261: + movq %rcx, %rdx + addq $2, %rcx + cmpq %rcx, %rsi + jb .L1477 + movdqa %xmm5, %xmm0 + pcmpeqd -16(%rdi,%rcx,8), %xmm0 + pshufd $177, %xmm0, %xmm1 + pand %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + je .L1261 +.L1458: + rep bsfl %eax, %eax + cltq + addq %rdx, %rax + jmp .L1255 +.L1354: + movl $5, %ebx + movl $6, %edx + jmp .L1295 +.L1355: + movl $4, %ebx + movl $5, %edx + jmp .L1295 +.L1264: + leaq -64(%rbp), %rdx + movq %r13, %rcx + movdqa %xmm2, %xmm0 + movaps %xmm1, -80(%rbp) + movq -216(%rbp), %rsi + subq %rax, %rsi + call _ZN3hwy7N_SSSE36detail22MaybePartitionTwoValueINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, %al + jne .L1236 + movddup 0(%r13), %xmm2 + movdqa -64(%rbp), %xmm3 + movdqa -80(%rbp), %xmm1 + movdqa %xmm2, %xmm5 + jmp .L1270 +.L1356: + movl $3, %ebx + movl $4, %edx + jmp .L1295 +.L1357: + movl $2, %ebx + movl $3, %edx + jmp .L1295 +.L1358: + movl $1, %ebx + movl $2, %edx + jmp .L1295 +.L1473: + movq %r14, %xmm6 + movq %r15, %rax + movq %r13, %r15 + movdqa -240(%rbp), %xmm3 + movddup %xmm6, %xmm0 + movq %rax, %r13 + movdqa -208(%rbp), %xmm6 + movq -104(%rbp), %rax + psubq %xmm0, %xmm3 + movdqa -80(%rbp), %xmm5 + movdqa -160(%rbp), %xmm1 + movdqa %xmm6, %xmm4 + movq -216(%rbp), %rdx + movq -176(%rbp), %r12 + pcmpeqd %xmm0, %xmm4 + pcmpgtd %xmm6, %xmm0 + movdqu (%rax), %xmm6 + movq -144(%rbp), %rbx + movdqa -128(%rbp), %xmm2 + subq %r15, %rdx + movaps %xmm6, -80(%rbp) + movdqa -80(%rbp), %xmm6 + pand %xmm4, %xmm3 + pcmpeqd %xmm5, %xmm6 + por %xmm0, %xmm3 + movdqa -80(%rbp), %xmm0 + pshufd $245, %xmm3, %xmm3 + pcmpeqd %xmm1, %xmm0 + pshufd $177, %xmm6, %xmm4 + pand %xmm3, %xmm4 + pand %xmm6, %xmm4 + pshufd $177, %xmm0, %xmm7 + pcmpeqd %xmm6, %xmm6 + pand %xmm7, %xmm0 + pxor %xmm6, %xmm3 + por %xmm3, %xmm0 + por %xmm4, %xmm0 + movmskpd %xmm0, %eax + cmpl $3, %eax + jne .L1478 + movmskpd %xmm4, %edi + movaps %xmm2, -96(%rbp) + movaps %xmm1, -80(%rbp) + movq %rdx, -128(%rbp) + call __popcountdi2@PLT + movq -104(%rbp), %rsi + movdqa -96(%rbp), %xmm2 + movslq %eax, %rcx + movq -128(%rbp), %rax + movdqa -80(%rbp), %xmm1 + movups %xmm2, (%rsi) + subq %rcx, %rax + cmpq $1, %rax + jbe .L1336 + leaq -2(%rax), %rcx + movq %rcx, %rdx + shrq %rdx + salq $4, %rdx + leaq 16(%rsi,%rdx), %rdx +.L1278: + movups %xmm1, (%r12) + addq $16, %r12 + cmpq %rdx, %r12 + jne .L1278 + andq $-2, %rcx + movq %rcx, %rdx + addq $2, %rdx + leaq 0(,%rdx,8), %rcx + subq %rdx, %rax +.L1277: + movaps %xmm1, 0(%r13) + testq %rax, %rax + je .L1236 + movq -104(%rbp), %rdi + leaq 0(,%rax,8), %rdx + movq %r13, %rsi + addq %rcx, %rdi + call memcpy@PLT + jmp .L1236 +.L1476: + movq -216(%rbp), %rcx + movq %rdi, %rdx + jmp .L1288 +.L1289: + movdqu -16(%rdx,%rsi,8), %xmm1 + movdqa %xmm1, %xmm3 + movdqa %xmm1, %xmm0 + pcmpeqd %xmm5, %xmm3 + psubq %xmm2, %xmm0 + pand %xmm3, %xmm0 + movdqa %xmm5, %xmm3 + pcmpgtd %xmm1, %xmm3 + por %xmm3, %xmm0 + pshufd $245, %xmm0, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + jne .L1461 +.L1288: + movq %rsi, %rax + addq $2, %rsi + cmpq %rsi, %rcx + jnb .L1289 + movq -216(%rbp), %rsi + cmpq %rax, %rsi + je .L1347 + movq -104(%rbp), %rax + movdqu -16(%rax,%rsi,8), %xmm1 + movdqa 
%xmm1, %xmm3 + movdqa %xmm1, %xmm0 + pcmpeqd %xmm5, %xmm3 + pcmpgtd %xmm1, %xmm5 + psubq %xmm2, %xmm0 + pand %xmm3, %xmm0 + por %xmm5, %xmm0 + pshufd $245, %xmm0, %xmm0 + movmskpd %xmm0, %eax + cmpl $1, %eax + movl $1, %eax + adcl $0, %eax + movl %eax, -268(%rbp) + jmp .L1290 +.L1292: + movl $11, %edx + movl $10, %ebx + jmp .L1295 +.L1477: + movq -216(%rbp), %rax + leaq -2(%rax), %rdx + movq -104(%rbp), %rax + movdqu (%rax,%rdx,8), %xmm6 + movaps %xmm6, -80(%rbp) + movdqa -80(%rbp), %xmm0 + pcmpeqd %xmm5, %xmm0 + pshufd $177, %xmm0, %xmm1 + pand %xmm1, %xmm0 + pcmpeqd %xmm1, %xmm1 + pxor %xmm1, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + je .L1236 + jmp .L1458 +.L1471: + rep bsfl %eax, %eax + cltq + jmp .L1255 +.L1240: + movq %rdx, %r10 + leaq -2(%rdx), %rdx + movq -104(%rbp), %r11 + leaq 8(%rcx), %rdi + movq %rdx, %rbx + andq $-8, %rdi + andq $-2, %rdx + movq %r10, %r12 + shrq %rbx + movq (%r11), %rax + movq %r11, %rsi + addq $1, %rbx + salq $4, %rbx + movq %rax, (%rcx) + movl %ebx, %r8d + leaq 8(%r11,%r8), %r15 + leaq 8(%rcx,%r8), %r14 + movq -16(%r15), %rax + movq %rax, -16(%r14) + movq %rcx, %rax + subq %rdi, %rax + subq %rax, %rsi + leal (%rbx,%rax), %ecx + leaq 2(%rdx), %rax + shrl $3, %ecx + rep movsq + movq %rax, -80(%rbp) + movq %r10, -216(%rbp) + subq %rax, %r12 + je .L1241 + salq $3, %rax + leaq 0(,%r12,8), %rdx + movq %r8, -96(%rbp) + leaq (%r11,%rax), %rsi + leaq 0(%r13,%rax), %rdi + call memcpy@PLT + movq -216(%rbp), %r10 + movl $32, %ecx + movq -96(%rbp), %r8 + movl %r10d, %eax + subl $1, %eax + bsrl %eax, %eax + xorl $31, %eax + subl %eax, %ecx + movl $1, %eax + salq %cl, %rax + shrq $4, %rax + movq %rax, %rsi + movl $1, %eax + cmove %rax, %rsi + movq %rsi, %rax + salq $4, %rax + addq $2, %rax + cmpq %rax, %r10 + jnb .L1479 +.L1242: + movdqa .LC4(%rip), %xmm0 + movq -216(%rbp), %rdx +.L1246: + movups %xmm0, 0(%r13,%rdx,8) + addq $2, %rdx + cmpq %rax, %rdx + jb .L1246 + movq %r13, %rdi + movq %r8, -96(%rbp) + call _ZN3hwy7N_SSSE36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + movq -104(%rbp), %rsi + movq 0(%r13), %rax + movq %rax, (%rsi) + movq -96(%rbp), %r8 + leaq 8(%rsi), %rdi + andq $-8, %rdi + movq -8(%r13,%r8), %rax + movq %rax, -8(%rsi,%r8) + movq %rsi, %rax + movq %r13, %rsi + subq %rdi, %rax + leal (%rbx,%rax), %ecx + subq %rax, %rsi + shrl $3, %ecx + rep movsq + testq %r12, %r12 + je .L1236 +.L1244: + movq -80(%rbp), %rax + movq -104(%rbp), %rdi + leaq 0(,%r12,8), %rdx + salq $3, %rax + addq %rax, %rdi + leaq 0(%r13,%rax), %rsi + call memcpy@PLT + jmp .L1236 +.L1347: + movl $2, -268(%rbp) + jmp .L1290 +.L1474: + pcmpeqd %xmm0, %xmm0 + paddq %xmm0, %xmm2 + jmp .L1290 +.L1475: + movdqa %xmm0, %xmm4 + pand %xmm3, %xmm0 + pandn %xmm1, %xmm4 + movdqa %xmm2, %xmm1 + por %xmm4, %xmm0 + movdqa %xmm0, %xmm3 + psubq %xmm0, %xmm1 + pcmpeqd %xmm5, %xmm3 + pcmpgtd %xmm5, %xmm0 + pand %xmm3, %xmm1 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + jne .L1461 + movdqa %xmm2, %xmm0 + movl $32, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L1282: + movq -104(%rbp), %rdi + leaq (%rcx,%rax,2), %rdx + movdqa %xmm0, %xmm1 + addq $1, %rax + movdqu (%rdi,%rdx,8), %xmm3 + movdqa %xmm3, %xmm4 + psubq %xmm3, %xmm1 + pcmpeqd %xmm0, %xmm4 + pand %xmm4, %xmm1 + movdqa %xmm3, %xmm4 + pcmpgtd %xmm0, %xmm4 + por %xmm4, %xmm1 + pshufd $245, %xmm1, %xmm1 + movdqa %xmm1, %xmm4 + pand %xmm1, %xmm3 + pandn %xmm0, %xmm4 + por %xmm4, %xmm3 + movdqa %xmm3, %xmm0 + cmpq 
$16, %rax + jne .L1282 + pcmpeqd %xmm5, %xmm3 + movdqa %xmm2, %xmm1 + psubq %xmm0, %xmm1 + pand %xmm3, %xmm1 + movdqa %xmm0, %xmm3 + pcmpgtd %xmm5, %xmm3 + por %xmm3, %xmm1 + pshufd $245, %xmm1, %xmm1 + movmskpd %xmm1, %eax + testl %eax, %eax + jne .L1461 + leaq 32(%rsi), %rax + cmpq %rax, -216(%rbp) + jb .L1480 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L1282 +.L1241: + movq -216(%rbp), %rdi + movl $32, %ecx + movl %edi, %eax + subl $1, %eax + bsrl %eax, %eax + xorl $31, %eax + subl %eax, %ecx + movl $1, %eax + salq %cl, %rax + shrq $4, %rax + movq %rax, %rsi + movl $1, %eax + cmove %rax, %rsi + movq %rsi, %rax + salq $4, %rax + addq $2, %rax + cmpq %rax, %rdi + jb .L1242 + movq %r13, %rdi + call _ZN3hwy7N_SSSE36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + movq -104(%rbp), %rsi + movq 0(%r13), %rax + movq %rax, (%rsi) + movq -16(%r14), %rax + leaq 8(%rsi), %rdi + andq $-8, %rdi + movq %rax, -16(%r15) + movq %rsi, %rax + movq %r13, %rsi + subq %rdi, %rax + leal (%rbx,%rax), %ecx + subq %rax, %rsi + shrl $3, %ecx + rep movsq + jmp .L1236 + .p2align 4,,10 + .p2align 3 +.L1336: + xorl %ecx, %ecx + jmp .L1277 +.L1478: + pxor %xmm6, %xmm0 + movq -104(%rbp), %rsi + movmskpd %xmm0, %eax + rep bsfl %eax, %eax + cltq + movddup (%rsi,%rax,8), %xmm3 + leaq 2(%r14), %rax + movaps %xmm3, -64(%rbp) + cmpq %rdx, %rax + ja .L1273 +.L1274: + movq -104(%rbp), %rsi + movq %rax, %r14 + movups %xmm1, -16(%rsi,%rax,8) + addq $2, %rax + cmpq %rdx, %rax + jbe .L1274 +.L1273: + movq %rdx, %rax + movdqa -208(%rbp), %xmm7 + movdqa -240(%rbp), %xmm0 + leaq 0(,%r14,8), %rcx + subq %r14, %rax + movq %rax, %xmm6 + movddup %xmm6, %xmm4 + movdqa %xmm7, %xmm6 + pcmpeqd %xmm4, %xmm6 + psubq %xmm4, %xmm0 + pcmpgtd %xmm7, %xmm4 + pand %xmm6, %xmm0 + por %xmm4, %xmm0 + pshufd $245, %xmm0, %xmm0 + movq %xmm0, %rax + testq %rax, %rax + je .L1275 + movq -104(%rbp), %rax + movq %rbx, (%rax,%r14,8) + jmp .L1275 +.L1480: + movq %rdi, %rdx + jmp .L1284 +.L1285: + movdqu -16(%rdx,%rsi,8), %xmm1 + movdqa %xmm2, %xmm0 + movdqa %xmm1, %xmm3 + psubq %xmm1, %xmm0 + pcmpeqd %xmm5, %xmm3 + pcmpgtd %xmm5, %xmm1 + pand %xmm3, %xmm0 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + jne .L1461 +.L1284: + movq %rsi, %rax + addq $2, %rsi + cmpq %rsi, -216(%rbp) + jnb .L1285 + movq -216(%rbp), %rsi + cmpq %rax, %rsi + je .L1286 + movq -104(%rbp), %rax + movdqa %xmm2, %xmm0 + movdqu -16(%rax,%rsi,8), %xmm1 + movdqa %xmm1, %xmm3 + psubq %xmm1, %xmm0 + pcmpeqd %xmm5, %xmm3 + pcmpgtd %xmm5, %xmm1 + pand %xmm3, %xmm0 + por %xmm1, %xmm0 + pshufd $245, %xmm0, %xmm0 + movmskpd %xmm0, %eax + testl %eax, %eax + jne .L1461 +.L1286: + movl $3, -268(%rbp) + pcmpeqd %xmm0, %xmm0 + paddq %xmm0, %xmm2 + jmp .L1290 +.L1479: + movq %r13, %rdi + call _ZN3hwy7N_SSSE36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + movq -104(%rbp), %rsi + movq 0(%r13), %rax + movq %rax, (%rsi) + movq -16(%r14), %rax + leaq 8(%rsi), %rdi + andq $-8, %rdi + movq %rax, -16(%r15) + movq %rsi, %rax + movq %r13, %rsi + subq %rdi, %rax + leal (%rbx,%rax), %ecx + subq %rax, %rsi + shrl $3, %ecx + rep movsq + jmp .L1244 + .cfi_endproc +.LFE18805: + .size _ZN3hwy7N_SSSE36detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0, 
.-_ZN3hwy7N_SSSE36detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + .section .text._ZN3hwy6N_AVX26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_AVX26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0, @function +_ZN3hwy6N_AVX26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0: +.LFB18807: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsi, %rax + movq %rdi, %rdx + salq $3, %rax + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + .cfi_offset 15, -24 + leaq (%rdi,%rax), %r15 + pushq %r14 + .cfi_offset 14, -32 + leaq (%r15,%rax), %r14 + pushq %r13 + .cfi_offset 13, -40 + leaq (%r14,%rax), %r13 + pushq %r12 + .cfi_offset 12, -48 + leaq 0(%r13,%rax), %r12 + pushq %rbx + .cfi_offset 3, -56 + leaq (%r12,%rax), %rbx + leaq (%rbx,%rax), %r11 + leaq (%r11,%rax), %r10 + andq $-32, %rsp + leaq (%r10,%rax), %r9 + subq $392, %rsp + leaq (%r9,%rax), %r8 + movq %rsi, 192(%rsp) + vmovdqu (%rdi), %ymm3 + vmovdqu (%r15), %ymm14 + leaq (%r8,%rax), %rdi + vmovdqu 0(%r13), %ymm11 + vmovdqu (%r14), %ymm4 + leaq (%rdi,%rax), %rsi + vpcmpgtq %ymm3, %ymm14, %ymm15 + vmovdqu (%rbx), %ymm12 + vmovdqu (%r12), %ymm1 + leaq (%rsi,%rax), %rcx + vmovdqu (%r10), %ymm9 + vmovdqu (%r11), %ymm2 + movq %rcx, 184(%rsp) + vmovdqu (%r9), %ymm8 + vmovdqu (%r8), %ymm10 + vpblendvb %ymm15, %ymm3, %ymm14, %ymm13 + vpblendvb %ymm15, %ymm14, %ymm3, %ymm3 + vmovdqu (%rsi), %ymm7 + vmovdqu (%rdi), %ymm0 + vpcmpgtq %ymm4, %ymm11, %ymm15 + vmovdqu (%rcx), %ymm5 + addq %rax, %rcx + movq %rcx, 176(%rsp) + vmovdqa %ymm5, 360(%rsp) + vmovdqu (%rcx), %ymm5 + addq %rax, %rcx + vpblendvb %ymm15, %ymm4, %ymm11, %ymm14 + vpblendvb %ymm15, %ymm11, %ymm4, %ymm4 + addq %rcx, %rax + vpcmpgtq %ymm1, %ymm12, %ymm15 + vmovdqu (%rax), %ymm6 + vpblendvb %ymm15, %ymm1, %ymm12, %ymm11 + vpblendvb %ymm15, %ymm12, %ymm1, %ymm1 + vpcmpgtq %ymm2, %ymm9, %ymm15 + vpblendvb %ymm15, %ymm2, %ymm9, %ymm12 + vpblendvb %ymm15, %ymm9, %ymm2, %ymm2 + vpcmpgtq %ymm8, %ymm10, %ymm15 + vpblendvb %ymm15, %ymm8, %ymm10, %ymm9 + vpblendvb %ymm15, %ymm10, %ymm8, %ymm15 + vmovdqa %ymm15, 328(%rsp) + vpcmpgtq %ymm0, %ymm7, %ymm8 + vmovdqa 360(%rsp), %ymm15 + vpblendvb %ymm8, %ymm0, %ymm7, %ymm10 + vpblendvb %ymm8, %ymm7, %ymm0, %ymm0 + vpcmpgtq %ymm15, %ymm5, %ymm8 + vpblendvb %ymm8, %ymm15, %ymm5, %ymm7 + vpblendvb %ymm8, %ymm5, %ymm15, %ymm5 + vmovdqa %ymm5, 360(%rsp) + vpcmpgtq (%rcx), %ymm6, %ymm5 + vpblendvb %ymm5, (%rcx), %ymm6, %ymm8 + vmovdqu (%rcx), %ymm15 + cmpq $1, 192(%rsp) + vpblendvb %ymm5, %ymm6, %ymm15, %ymm5 + vpcmpgtq %ymm13, %ymm14, %ymm15 + vpblendvb %ymm15, %ymm13, %ymm14, %ymm6 + vpblendvb %ymm15, %ymm14, %ymm13, %ymm15 + vpcmpgtq %ymm3, %ymm4, %ymm14 + vpblendvb %ymm14, %ymm3, %ymm4, %ymm13 + vpblendvb %ymm14, %ymm4, %ymm3, %ymm14 + vpcmpgtq %ymm11, %ymm12, %ymm4 + vpblendvb %ymm4, %ymm11, %ymm12, %ymm3 + vpblendvb %ymm4, %ymm12, %ymm11, %ymm4 + vpcmpgtq %ymm1, %ymm2, %ymm12 + vpblendvb %ymm12, %ymm1, %ymm2, %ymm11 + vpblendvb %ymm12, %ymm2, %ymm1, %ymm1 + vpcmpgtq %ymm9, %ymm10, %ymm12 + vpblendvb %ymm12, %ymm9, %ymm10, %ymm2 + vpblendvb %ymm12, %ymm10, %ymm9, %ymm10 + vmovdqa 328(%rsp), %ymm12 + vmovdqa %ymm10, 296(%rsp) + vpcmpgtq %ymm12, %ymm0, %ymm10 + vpblendvb %ymm10, %ymm12, %ymm0, %ymm9 + vpblendvb %ymm10, 
%ymm0, %ymm12, %ymm0 + vmovdqa 360(%rsp), %ymm12 + vmovdqa %ymm0, 328(%rsp) + vpcmpgtq %ymm7, %ymm8, %ymm0 + vpblendvb %ymm0, %ymm7, %ymm8, %ymm10 + vpblendvb %ymm0, %ymm8, %ymm7, %ymm7 + vpcmpgtq %ymm12, %ymm5, %ymm0 + vpblendvb %ymm0, %ymm12, %ymm5, %ymm8 + vpblendvb %ymm0, %ymm5, %ymm12, %ymm0 + vpcmpgtq %ymm6, %ymm3, %ymm12 + vpblendvb %ymm12, %ymm6, %ymm3, %ymm5 + vpblendvb %ymm12, %ymm3, %ymm6, %ymm12 + vpcmpgtq %ymm13, %ymm11, %ymm6 + vpblendvb %ymm6, %ymm13, %ymm11, %ymm3 + vpblendvb %ymm6, %ymm11, %ymm13, %ymm13 + vpcmpgtq %ymm15, %ymm4, %ymm11 + vpblendvb %ymm11, %ymm15, %ymm4, %ymm6 + vpblendvb %ymm11, %ymm4, %ymm15, %ymm15 + vpcmpgtq %ymm14, %ymm1, %ymm11 + vpblendvb %ymm11, %ymm14, %ymm1, %ymm4 + vpblendvb %ymm11, %ymm1, %ymm14, %ymm1 + vmovdqa 296(%rsp), %ymm14 + vmovdqa %ymm1, 264(%rsp) + vpcmpgtq %ymm2, %ymm10, %ymm1 + vpblendvb %ymm1, %ymm2, %ymm10, %ymm11 + vpblendvb %ymm1, %ymm10, %ymm2, %ymm2 + vpcmpgtq %ymm9, %ymm8, %ymm1 + vpblendvb %ymm1, %ymm9, %ymm8, %ymm10 + vpblendvb %ymm1, %ymm8, %ymm9, %ymm8 + vpcmpgtq %ymm14, %ymm7, %ymm1 + vpblendvb %ymm1, %ymm14, %ymm7, %ymm9 + vpblendvb %ymm1, %ymm7, %ymm14, %ymm7 + vmovdqa %ymm7, 360(%rsp) + vmovdqa 328(%rsp), %ymm7 + vpcmpgtq %ymm7, %ymm0, %ymm14 + vpblendvb %ymm14, %ymm7, %ymm0, %ymm1 + vpblendvb %ymm14, %ymm0, %ymm7, %ymm0 + vpcmpgtq %ymm5, %ymm11, %ymm14 + vpblendvb %ymm14, %ymm5, %ymm11, %ymm7 + vpblendvb %ymm14, %ymm11, %ymm5, %ymm14 + vpcmpgtq %ymm3, %ymm10, %ymm5 + vmovdqa %ymm7, 40(%rsp) + vmovdqa %ymm7, 328(%rsp) + vpblendvb %ymm5, %ymm3, %ymm10, %ymm7 + vpblendvb %ymm5, %ymm10, %ymm3, %ymm3 + vmovdqa %ymm7, 296(%rsp) + vpcmpgtq %ymm6, %ymm9, %ymm10 + vpblendvb %ymm10, %ymm6, %ymm9, %ymm5 + vpblendvb %ymm10, %ymm9, %ymm6, %ymm6 + vpcmpgtq %ymm4, %ymm1, %ymm10 + vpblendvb %ymm10, %ymm4, %ymm1, %ymm9 + vpblendvb %ymm10, %ymm1, %ymm4, %ymm4 + vmovdqa 360(%rsp), %ymm1 + vpcmpgtq %ymm12, %ymm2, %ymm10 + vpblendvb %ymm10, %ymm12, %ymm2, %ymm7 + vpblendvb %ymm10, %ymm2, %ymm12, %ymm10 + vpcmpgtq %ymm15, %ymm1, %ymm12 + vpcmpgtq %ymm13, %ymm8, %ymm2 + vmovdqa 360(%rsp), %ymm1 + vpblendvb %ymm2, %ymm13, %ymm8, %ymm11 + vpblendvb %ymm2, %ymm8, %ymm13, %ymm2 + vpblendvb %ymm12, %ymm15, %ymm1, %ymm8 + vpblendvb %ymm12, %ymm1, %ymm15, %ymm1 + vmovdqa 264(%rsp), %ymm15 + vpcmpgtq %ymm15, %ymm0, %ymm13 + vpblendvb %ymm13, %ymm15, %ymm0, %ymm12 + vpblendvb %ymm13, %ymm0, %ymm15, %ymm0 + vmovdqa 296(%rsp), %ymm15 + vmovdqa %ymm0, 136(%rsp) + vpcmpgtq %ymm11, %ymm6, %ymm13 + vpblendvb %ymm13, %ymm11, %ymm6, %ymm0 + vpblendvb %ymm13, %ymm6, %ymm11, %ymm6 + vpcmpgtq %ymm8, %ymm3, %ymm13 + vpblendvb %ymm13, %ymm8, %ymm3, %ymm11 + vpblendvb %ymm13, %ymm3, %ymm8, %ymm8 + vpcmpgtq %ymm9, %ymm10, %ymm3 + vpblendvb %ymm3, %ymm9, %ymm10, %ymm13 + vpblendvb %ymm3, %ymm10, %ymm9, %ymm9 + vpcmpgtq %ymm12, %ymm4, %ymm10 + vpblendvb %ymm10, %ymm12, %ymm4, %ymm3 + vpblendvb %ymm10, %ymm4, %ymm12, %ymm4 + vpcmpgtq %ymm2, %ymm1, %ymm12 + vpblendvb %ymm12, %ymm2, %ymm1, %ymm10 + vpblendvb %ymm12, %ymm1, %ymm2, %ymm2 + vpcmpgtq %ymm7, %ymm14, %ymm1 + vpblendvb %ymm1, %ymm7, %ymm14, %ymm12 + vpblendvb %ymm1, %ymm14, %ymm7, %ymm14 + vpcmpgtq %ymm15, %ymm5, %ymm7 + vpblendvb %ymm7, %ymm15, %ymm5, %ymm1 + vpblendvb %ymm7, %ymm5, %ymm15, %ymm5 + vpcmpgtq %ymm1, %ymm12, %ymm7 + vpblendvb %ymm7, %ymm1, %ymm12, %ymm15 + vpblendvb %ymm7, %ymm12, %ymm1, %ymm1 + vpcmpgtq %ymm3, %ymm10, %ymm7 + vmovdqa %ymm15, 8(%rsp) + vmovdqa %ymm15, 360(%rsp) + vpblendvb %ymm7, %ymm3, %ymm10, %ymm12 + vpblendvb %ymm7, %ymm10, %ymm3, %ymm10 + vpcmpgtq %ymm5, %ymm14, %ymm7 + 
vpblendvb %ymm7, %ymm5, %ymm14, %ymm3 + vpblendvb %ymm7, %ymm14, %ymm5, %ymm7 + vpcmpgtq %ymm4, %ymm2, %ymm5 + vpblendvb %ymm5, %ymm4, %ymm2, %ymm14 + vpblendvb %ymm5, %ymm2, %ymm4, %ymm4 + vpcmpgtq %ymm3, %ymm1, %ymm5 + vmovdqa %ymm4, 104(%rsp) + vpcmpgtq %ymm12, %ymm9, %ymm2 + vpblendvb %ymm5, %ymm3, %ymm1, %ymm4 + vpblendvb %ymm5, %ymm1, %ymm3, %ymm5 + vpcmpgtq %ymm0, %ymm11, %ymm1 + vmovdqa %ymm4, -24(%rsp) + vmovdqa %ymm4, 296(%rsp) + vpblendvb %ymm1, %ymm0, %ymm11, %ymm15 + vpblendvb %ymm1, %ymm11, %ymm0, %ymm1 + vpcmpgtq %ymm8, %ymm6, %ymm0 + vpblendvb %ymm0, %ymm8, %ymm6, %ymm11 + vpblendvb %ymm0, %ymm6, %ymm8, %ymm6 + vpcmpgtq %ymm14, %ymm10, %ymm0 + vpblendvb %ymm0, %ymm10, %ymm14, %ymm4 + vpblendvb %ymm0, %ymm14, %ymm10, %ymm8 + vpblendvb %ymm2, %ymm12, %ymm9, %ymm0 + vmovdqa %ymm4, 72(%rsp) + vpcmpgtq %ymm13, %ymm7, %ymm4 + vpblendvb %ymm2, %ymm9, %ymm12, %ymm2 + vpblendvb %ymm4, %ymm13, %ymm7, %ymm3 + vpblendvb %ymm4, %ymm7, %ymm13, %ymm4 + vpcmpgtq %ymm3, %ymm15, %ymm7 + vpblendvb %ymm7, %ymm3, %ymm15, %ymm12 + vpblendvb %ymm7, %ymm15, %ymm3, %ymm3 + vpcmpgtq %ymm1, %ymm4, %ymm7 + vpblendvb %ymm7, %ymm1, %ymm4, %ymm10 + vpblendvb %ymm7, %ymm4, %ymm1, %ymm4 + vpcmpgtq %ymm0, %ymm11, %ymm1 + vpblendvb %ymm1, %ymm0, %ymm11, %ymm7 + vpblendvb %ymm1, %ymm11, %ymm0, %ymm0 + vpcmpgtq %ymm6, %ymm2, %ymm1 + vpblendvb %ymm1, %ymm6, %ymm2, %ymm9 + vpblendvb %ymm1, %ymm2, %ymm6, %ymm2 + vpcmpgtq %ymm12, %ymm5, %ymm6 + vpblendvb %ymm6, %ymm12, %ymm5, %ymm1 + vpblendvb %ymm6, %ymm5, %ymm12, %ymm15 + vmovdqa %ymm1, 264(%rsp) + vpcmpgtq %ymm3, %ymm10, %ymm5 + vmovdqa %ymm15, 232(%rsp) + vpblendvb %ymm5, %ymm3, %ymm10, %ymm11 + vpblendvb %ymm5, %ymm10, %ymm3, %ymm3 + vmovdqa %ymm11, 200(%rsp) + vpcmpgtq %ymm7, %ymm4, %ymm5 + vpblendvb %ymm5, %ymm7, %ymm4, %ymm6 + vpblendvb %ymm5, %ymm4, %ymm7, %ymm4 + vpcmpgtq %ymm0, %ymm9, %ymm5 + vpblendvb %ymm5, %ymm0, %ymm9, %ymm7 + vpblendvb %ymm5, %ymm9, %ymm0, %ymm0 + vpcmpgtq %ymm8, %ymm2, %ymm9 + vpblendvb %ymm9, %ymm8, %ymm2, %ymm5 + vpblendvb %ymm9, %ymm2, %ymm8, %ymm2 + vpcmpgtq %ymm3, %ymm6, %ymm8 + vpblendvb %ymm8, %ymm3, %ymm6, %ymm10 + vpblendvb %ymm8, %ymm6, %ymm3, %ymm3 + vpcmpgtq %ymm4, %ymm7, %ymm6 + vpblendvb %ymm6, %ymm4, %ymm7, %ymm12 + vpblendvb %ymm6, %ymm7, %ymm4, %ymm6 + jbe .L1486 + vmovdqa 40(%rsp), %ymm13 + vpshufd $78, 104(%rsp), %ymm4 + vpshufd $78, 136(%rsp), %ymm7 + vpshufd $78, 72(%rsp), %ymm9 + vpshufd $78, %ymm2, %ymm2 + vpshufd $78, %ymm5, %ymm5 + vpshufd $78, %ymm0, %ymm0 + vpcmpgtq %ymm13, %ymm7, %ymm8 + vpshufd $78, %ymm6, %ymm6 + vpshufd $78, %ymm12, %ymm12 + vpblendvb %ymm8, %ymm13, %ymm7, %ymm14 + vpblendvb %ymm8, %ymm7, %ymm13, %ymm7 + vmovdqa 8(%rsp), %ymm13 + vmovdqa %ymm7, 360(%rsp) + vpcmpgtq %ymm13, %ymm4, %ymm8 + vpblendvb %ymm8, %ymm13, %ymm4, %ymm7 + vpblendvb %ymm8, %ymm4, %ymm13, %ymm4 + vmovdqa -24(%rsp), %ymm13 + vmovdqa %ymm4, 328(%rsp) + vpcmpgtq %ymm13, %ymm9, %ymm8 + vpblendvb %ymm8, %ymm13, %ymm9, %ymm4 + vpblendvb %ymm8, %ymm9, %ymm13, %ymm9 + vpcmpgtq %ymm1, %ymm2, %ymm8 + vpshufd $78, %ymm9, %ymm9 + vpblendvb %ymm8, %ymm1, %ymm2, %ymm13 + vpblendvb %ymm8, %ymm2, %ymm1, %ymm1 + vpcmpgtq %ymm15, %ymm5, %ymm8 + vpshufd $78, %ymm1, %ymm1 + vmovdqa %ymm13, 296(%rsp) + vpblendvb %ymm8, %ymm15, %ymm5, %ymm2 + vpblendvb %ymm8, %ymm5, %ymm15, %ymm5 + vpcmpgtq %ymm11, %ymm0, %ymm8 + vpshufd $78, %ymm2, %ymm2 + vmovdqa %ymm5, 264(%rsp) + vpblendvb %ymm8, %ymm11, %ymm0, %ymm5 + vpblendvb %ymm8, %ymm0, %ymm11, %ymm0 + vpcmpgtq %ymm10, %ymm6, %ymm11 + vpshufd $78, %ymm5, %ymm5 + vpblendvb %ymm11, %ymm10, 
%ymm6, %ymm8 + vpblendvb %ymm11, %ymm6, %ymm10, %ymm10 + vpcmpgtq %ymm3, %ymm12, %ymm6 + vpshufd $78, %ymm8, %ymm8 + vpblendvb %ymm6, %ymm3, %ymm12, %ymm11 + vpblendvb %ymm6, %ymm12, %ymm3, %ymm3 + vpshufd $78, 360(%rsp), %ymm6 + vpshufd $78, 328(%rsp), %ymm12 + vpshufd $78, %ymm11, %ymm11 + vpcmpgtq %ymm14, %ymm11, %ymm15 + vpblendvb %ymm15, %ymm14, %ymm11, %ymm13 + vpblendvb %ymm15, %ymm11, %ymm14, %ymm15 + vpcmpgtq %ymm3, %ymm6, %ymm11 + vpblendvb %ymm11, %ymm3, %ymm6, %ymm14 + vmovdqa %ymm14, 360(%rsp) + vpblendvb %ymm11, %ymm6, %ymm3, %ymm14 + vpcmpgtq %ymm7, %ymm8, %ymm3 + vmovdqa 296(%rsp), %ymm11 + vpblendvb %ymm3, %ymm7, %ymm8, %ymm6 + vpblendvb %ymm3, %ymm8, %ymm7, %ymm7 + vpcmpgtq %ymm10, %ymm12, %ymm3 + vpshufd $78, %ymm7, %ymm7 + vpblendvb %ymm3, %ymm10, %ymm12, %ymm8 + vpblendvb %ymm3, %ymm12, %ymm10, %ymm10 + vmovdqa 264(%rsp), %ymm12 + vpcmpgtq %ymm4, %ymm5, %ymm3 + vpshufd $78, %ymm10, %ymm10 + vmovdqa %ymm8, 328(%rsp) + vpblendvb %ymm3, %ymm4, %ymm5, %ymm8 + vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 + vpcmpgtq %ymm0, %ymm9, %ymm3 + vpshufd $78, %ymm8, %ymm8 + vpblendvb %ymm3, %ymm0, %ymm9, %ymm5 + vpblendvb %ymm3, %ymm9, %ymm0, %ymm0 + vpcmpgtq %ymm11, %ymm2, %ymm3 + vpshufd $78, %ymm5, %ymm5 + vpblendvb %ymm3, %ymm11, %ymm2, %ymm9 + vpblendvb %ymm3, %ymm2, %ymm11, %ymm2 + vpcmpgtq %ymm12, %ymm1, %ymm11 + vpshufd $78, %ymm9, %ymm9 + vpblendvb %ymm11, %ymm12, %ymm1, %ymm3 + vpblendvb %ymm11, %ymm1, %ymm12, %ymm1 + vpshufd $78, %ymm15, %ymm12 + vpcmpgtq %ymm13, %ymm9, %ymm15 + vpshufd $78, %ymm14, %ymm11 + vpshufd $78, %ymm3, %ymm3 + vpblendvb %ymm15, %ymm13, %ymm9, %ymm14 + vpblendvb %ymm15, %ymm9, %ymm13, %ymm15 + vpcmpgtq %ymm6, %ymm8, %ymm9 + vpblendvb %ymm9, %ymm6, %ymm8, %ymm13 + vpblendvb %ymm9, %ymm8, %ymm6, %ymm6 + vpcmpgtq %ymm2, %ymm12, %ymm8 + vpblendvb %ymm8, %ymm2, %ymm12, %ymm9 + vpblendvb %ymm8, %ymm12, %ymm2, %ymm2 + vpcmpgtq %ymm4, %ymm7, %ymm8 + vpshufd $78, %ymm2, %ymm2 + vmovdqa %ymm9, 296(%rsp) + vpblendvb %ymm8, %ymm4, %ymm7, %ymm9 + vpblendvb %ymm8, %ymm7, %ymm4, %ymm4 + vmovdqa 360(%rsp), %ymm8 + vpshufd $78, %ymm9, %ymm9 + vpcmpgtq %ymm8, %ymm3, %ymm7 + vpblendvb %ymm7, %ymm8, %ymm3, %ymm12 + vpblendvb %ymm7, %ymm3, %ymm8, %ymm3 + vmovdqa 328(%rsp), %ymm8 + vmovdqa %ymm3, 360(%rsp) + vpcmpgtq %ymm8, %ymm5, %ymm7 + vpblendvb %ymm7, %ymm8, %ymm5, %ymm3 + vpblendvb %ymm7, %ymm5, %ymm8, %ymm5 + vpcmpgtq %ymm1, %ymm11, %ymm7 + vpblendvb %ymm7, %ymm1, %ymm11, %ymm8 + vpblendvb %ymm7, %ymm11, %ymm1, %ymm1 + vpcmpgtq %ymm0, %ymm10, %ymm11 + vpshufd $78, %ymm1, %ymm1 + vmovdqa %ymm8, 328(%rsp) + vpshufd $78, %ymm3, %ymm8 + vpshufd $78, 360(%rsp), %ymm3 + vpblendvb %ymm11, %ymm0, %ymm10, %ymm7 + vpblendvb %ymm11, %ymm10, %ymm0, %ymm0 + vpshufd $78, %ymm13, %ymm11 + vpshufd $78, %ymm15, %ymm10 + vpcmpgtq %ymm14, %ymm11, %ymm15 + vpshufd $78, %ymm7, %ymm7 + vpblendvb %ymm15, %ymm14, %ymm11, %ymm13 + vpblendvb %ymm15, %ymm11, %ymm14, %ymm14 + vmovdqa %ymm13, 360(%rsp) + vpcmpgtq %ymm6, %ymm10, %ymm11 + vmovdqa 296(%rsp), %ymm13 + vpblendvb %ymm11, %ymm6, %ymm10, %ymm15 + vpblendvb %ymm11, %ymm10, %ymm6, %ymm10 + vpcmpgtq %ymm13, %ymm9, %ymm6 + vpblendvb %ymm6, %ymm13, %ymm9, %ymm11 + vpblendvb %ymm6, %ymm9, %ymm13, %ymm9 + vpcmpgtq %ymm4, %ymm2, %ymm6 + vpblendvb %ymm6, %ymm4, %ymm2, %ymm13 + vpblendvb %ymm6, %ymm2, %ymm4, %ymm4 + vpcmpgtq %ymm12, %ymm8, %ymm2 + vpblendvb %ymm2, %ymm12, %ymm8, %ymm6 + vpblendvb %ymm2, %ymm8, %ymm12, %ymm8 + vmovdqa %ymm6, 136(%rsp) + vpcmpgtq %ymm5, %ymm3, %ymm2 + vpblendvb %ymm2, %ymm5, %ymm3, %ymm12 + vpblendvb %ymm2, %ymm3, 
%ymm5, %ymm3 + vmovdqa 328(%rsp), %ymm5 + vmovdqa %ymm12, 104(%rsp) + vpcmpgtq %ymm5, %ymm7, %ymm2 + vpblendvb %ymm2, %ymm5, %ymm7, %ymm6 + vpblendvb %ymm2, %ymm7, %ymm5, %ymm7 + vmovdqa 360(%rsp), %ymm5 + vpcmpgtq %ymm0, %ymm1, %ymm2 + vmovdqa %ymm6, 72(%rsp) + vpblendvb %ymm2, %ymm0, %ymm1, %ymm12 + vpblendvb %ymm2, %ymm1, %ymm0, %ymm2 + vpshufd $78, %ymm5, %ymm1 + vpcmpgtq %ymm5, %ymm1, %ymm0 + vmovdqa %ymm12, 40(%rsp) + vmovdqa %ymm2, 8(%rsp) + vpunpckhqdq %ymm0, %ymm0, %ymm0 + vpblendvb %ymm0, %ymm1, %ymm5, %ymm6 + vpshufd $78, %ymm14, %ymm1 + vmovdqa %ymm6, 328(%rsp) + vpcmpgtq %ymm14, %ymm1, %ymm0 + vpunpckhqdq %ymm0, %ymm0, %ymm0 + vpblendvb %ymm0, %ymm1, %ymm14, %ymm1 + vmovdqa %ymm1, -24(%rsp) + vmovdqa 104(%rsp), %ymm5 + cmpq $3, 192(%rsp) + vmovdqa %ymm1, 360(%rsp) + vpshufd $78, %ymm15, %ymm1 + vpcmpgtq %ymm15, %ymm1, %ymm0 + vpunpckhqdq %ymm0, %ymm0, %ymm0 + vpblendvb %ymm0, %ymm1, %ymm15, %ymm15 + vpshufd $78, %ymm10, %ymm1 + vmovdqa %ymm15, 296(%rsp) + vpcmpgtq %ymm10, %ymm1, %ymm0 + vpunpckhqdq %ymm0, %ymm0, %ymm0 + vpblendvb %ymm0, %ymm1, %ymm10, %ymm1 + vpshufd $78, %ymm8, %ymm10 + vmovdqa %ymm1, -56(%rsp) + vmovdqa %ymm1, 264(%rsp) + vpshufd $78, %ymm11, %ymm1 + vpcmpgtq %ymm11, %ymm1, %ymm0 + vpunpckhqdq %ymm0, %ymm0, %ymm0 + vpblendvb %ymm0, %ymm1, %ymm11, %ymm11 + vpshufd $78, %ymm9, %ymm1 + vmovdqa %ymm11, 232(%rsp) + vpcmpgtq %ymm9, %ymm1, %ymm0 + vpunpckhqdq %ymm0, %ymm0, %ymm0 + vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 + vmovdqa 40(%rsp), %ymm9 + vmovdqa %ymm1, -88(%rsp) + vmovdqa %ymm1, 200(%rsp) + vpshufd $78, %ymm13, %ymm1 + vpcmpgtq %ymm13, %ymm1, %ymm0 + vpunpckhqdq %ymm0, %ymm0, %ymm0 + vpblendvb %ymm0, %ymm1, %ymm13, %ymm12 + vpshufd $78, %ymm4, %ymm1 + vpcmpgtq %ymm4, %ymm1, %ymm0 + vmovdqa %ymm12, %ymm14 + vpunpckhqdq %ymm0, %ymm0, %ymm0 + vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 + vmovdqa 136(%rsp), %ymm4 + vmovdqa %ymm1, -120(%rsp) + vmovdqa %ymm1, %ymm13 + vpshufd $78, %ymm4, %ymm1 + vpcmpgtq %ymm4, %ymm1, %ymm0 + vpunpckhqdq %ymm0, %ymm0, %ymm0 + vpblendvb %ymm0, %ymm1, %ymm4, %ymm4 + vpcmpgtq %ymm8, %ymm10, %ymm0 + vpshufd $78, %ymm5, %ymm1 + vpcmpgtq %ymm5, %ymm1, %ymm2 + vpunpckhqdq %ymm0, %ymm0, %ymm0 + vpunpckhqdq %ymm2, %ymm2, %ymm2 + vpblendvb %ymm0, %ymm10, %ymm8, %ymm10 + vmovdqa 8(%rsp), %ymm8 + vpblendvb %ymm2, %ymm1, %ymm5, %ymm1 + vpshufd $78, %ymm3, %ymm5 + vpcmpgtq %ymm3, %ymm5, %ymm0 + vpunpckhqdq %ymm0, %ymm0, %ymm0 + vpblendvb %ymm0, %ymm5, %ymm3, %ymm5 + vmovdqa 72(%rsp), %ymm0 + vpshufd $78, %ymm0, %ymm2 + vpcmpgtq %ymm0, %ymm2, %ymm3 + vpunpckhqdq %ymm3, %ymm3, %ymm3 + vpblendvb %ymm3, %ymm2, %ymm0, %ymm2 + vpshufd $78, %ymm7, %ymm3 + vpcmpgtq %ymm7, %ymm3, %ymm0 + vpunpckhqdq %ymm0, %ymm0, %ymm0 + vpblendvb %ymm0, %ymm3, %ymm7, %ymm7 + vpshufd $78, %ymm9, %ymm3 + vpcmpgtq %ymm9, %ymm3, %ymm0 + vpunpckhqdq %ymm0, %ymm0, %ymm0 + vpblendvb %ymm0, %ymm3, %ymm9, %ymm3 + vpshufd $78, %ymm8, %ymm9 + vpcmpgtq %ymm8, %ymm9, %ymm0 + vpunpckhqdq %ymm0, %ymm0, %ymm0 + vpblendvb %ymm0, %ymm9, %ymm8, %ymm9 + jbe .L1487 + vpshufd $78, %ymm9, %ymm9 + vpshufd $78, %ymm5, %ymm5 + vpshufd $78, %ymm2, %ymm0 + vpermq $78, %ymm9, %ymm9 + vpermq $78, %ymm5, %ymm8 + vpshufd $78, %ymm3, %ymm3 + vmovdqa -24(%rsp), %ymm5 + vpcmpgtq %ymm6, %ymm9, %ymm2 + vpermq $78, %ymm3, %ymm3 + vpshufd $78, %ymm7, %ymm7 + vpermq $78, %ymm7, %ymm7 + vpermq $78, %ymm0, %ymm0 + vpshufd $78, %ymm1, %ymm1 + vpermq $78, %ymm1, %ymm1 + vpshufd $78, %ymm10, %ymm10 + vpshufd $78, %ymm4, %ymm4 + vpblendvb %ymm2, %ymm6, %ymm9, %ymm14 + vpblendvb %ymm2, %ymm9, %ymm6, %ymm13 + vmovdqa 
-56(%rsp), %ymm9 + vpermq $78, %ymm10, %ymm10 + vpcmpgtq %ymm5, %ymm3, %ymm2 + vpermq $78, %ymm4, %ymm4 + vpblendvb %ymm2, %ymm5, %ymm3, %ymm6 + vpblendvb %ymm2, %ymm3, %ymm5, %ymm3 + vpcmpgtq %ymm15, %ymm7, %ymm2 + vpshufd $78, %ymm3, %ymm3 + vpermq $78, %ymm3, %ymm3 + vpblendvb %ymm2, %ymm15, %ymm7, %ymm5 + vpblendvb %ymm2, %ymm7, %ymm15, %ymm7 + vpcmpgtq %ymm9, %ymm0, %ymm2 + vpshufd $78, %ymm7, %ymm7 + vpermq $78, %ymm7, %ymm7 + vpblendvb %ymm2, %ymm9, %ymm0, %ymm15 + vpblendvb %ymm2, %ymm0, %ymm9, %ymm0 + vpcmpgtq %ymm11, %ymm8, %ymm9 + vpshufd $78, %ymm0, %ymm0 + vmovdqa %ymm15, 360(%rsp) + vmovdqa -120(%rsp), %ymm15 + vpermq $78, %ymm0, %ymm0 + vpblendvb %ymm9, %ymm11, %ymm8, %ymm2 + vpblendvb %ymm9, %ymm8, %ymm11, %ymm11 + vmovdqa -88(%rsp), %ymm9 + vmovdqa %ymm11, 328(%rsp) + vpshufd $78, %ymm2, %ymm2 + vpcmpgtq %ymm9, %ymm1, %ymm8 + vpermq $78, %ymm2, %ymm2 + vpblendvb %ymm8, %ymm9, %ymm1, %ymm11 + vpblendvb %ymm8, %ymm1, %ymm9, %ymm1 + vpcmpgtq %ymm12, %ymm10, %ymm9 + vpshufd $78, %ymm11, %ymm11 + vpermq $78, %ymm11, %ymm11 + vpblendvb %ymm9, %ymm12, %ymm10, %ymm8 + vpblendvb %ymm9, %ymm10, %ymm12, %ymm10 + vpcmpgtq %ymm15, %ymm4, %ymm9 + vpshufd $78, %ymm8, %ymm8 + vpermq $78, %ymm8, %ymm8 + vpblendvb %ymm9, %ymm15, %ymm4, %ymm12 + vpblendvb %ymm9, %ymm4, %ymm15, %ymm4 + vpshufd $78, %ymm13, %ymm9 + vpshufd $78, %ymm12, %ymm12 + vpermq $78, %ymm9, %ymm9 + vpermq $78, %ymm12, %ymm12 + vpcmpgtq %ymm14, %ymm12, %ymm15 + vpblendvb %ymm15, %ymm14, %ymm12, %ymm13 + vpblendvb %ymm15, %ymm12, %ymm14, %ymm15 + vpcmpgtq %ymm4, %ymm9, %ymm12 + vpblendvb %ymm12, %ymm4, %ymm9, %ymm14 + vmovdqa %ymm14, 296(%rsp) + vpblendvb %ymm12, %ymm9, %ymm4, %ymm14 + vpcmpgtq %ymm6, %ymm8, %ymm4 + vpblendvb %ymm4, %ymm6, %ymm8, %ymm9 + vpblendvb %ymm4, %ymm8, %ymm6, %ymm6 + vpcmpgtq %ymm10, %ymm3, %ymm4 + vpshufd $78, %ymm6, %ymm6 + vpermq $78, %ymm6, %ymm6 + vpblendvb %ymm4, %ymm10, %ymm3, %ymm12 + vpblendvb %ymm4, %ymm3, %ymm10, %ymm10 + vpcmpgtq %ymm5, %ymm11, %ymm3 + vpshufd $78, %ymm10, %ymm10 + vmovdqa %ymm12, 264(%rsp) + vpermq $78, %ymm10, %ymm10 + vpblendvb %ymm3, %ymm5, %ymm11, %ymm8 + vpblendvb %ymm3, %ymm11, %ymm5, %ymm5 + vmovdqa 328(%rsp), %ymm11 + vpcmpgtq %ymm1, %ymm7, %ymm3 + vpshufd $78, %ymm8, %ymm8 + vpermq $78, %ymm8, %ymm8 + vpblendvb %ymm3, %ymm1, %ymm7, %ymm4 + vpblendvb %ymm3, %ymm7, %ymm1, %ymm1 + vmovdqa 360(%rsp), %ymm7 + vpshufd $78, %ymm4, %ymm4 + vpcmpgtq %ymm7, %ymm2, %ymm3 + vpermq $78, %ymm4, %ymm4 + vpblendvb %ymm3, %ymm7, %ymm2, %ymm12 + vpblendvb %ymm3, %ymm2, %ymm7, %ymm2 + vpcmpgtq %ymm11, %ymm0, %ymm7 + vpshufd $78, %ymm12, %ymm12 + vpermq $78, %ymm12, %ymm12 + vpblendvb %ymm7, %ymm11, %ymm0, %ymm3 + vpblendvb %ymm7, %ymm0, %ymm11, %ymm0 + vpshufd $78, %ymm15, %ymm7 + vpcmpgtq %ymm13, %ymm12, %ymm15 + vpshufd $78, %ymm14, %ymm11 + vpermq $78, %ymm7, %ymm7 + vpshufd $78, %ymm3, %ymm3 + vpermq $78, %ymm11, %ymm11 + vpermq $78, %ymm3, %ymm3 + vpblendvb %ymm15, %ymm13, %ymm12, %ymm14 + vpblendvb %ymm15, %ymm12, %ymm13, %ymm15 + vpcmpgtq %ymm9, %ymm8, %ymm12 + vpblendvb %ymm12, %ymm9, %ymm8, %ymm13 + vpblendvb %ymm12, %ymm8, %ymm9, %ymm9 + vmovdqa %ymm13, 360(%rsp) + vpcmpgtq %ymm2, %ymm7, %ymm8 + vpblendvb %ymm8, %ymm2, %ymm7, %ymm13 + vpblendvb %ymm8, %ymm7, %ymm2, %ymm2 + vpcmpgtq %ymm5, %ymm6, %ymm7 + vpshufd $78, %ymm2, %ymm2 + vpermq $78, %ymm2, %ymm2 + vpblendvb %ymm7, %ymm5, %ymm6, %ymm8 + vpblendvb %ymm7, %ymm6, %ymm5, %ymm5 + vmovdqa 296(%rsp), %ymm7 + vpshufd $78, %ymm8, %ymm8 + vpcmpgtq %ymm7, %ymm3, %ymm6 + vpermq $78, %ymm8, %ymm8 + vpblendvb %ymm6, %ymm7, 
%ymm3, %ymm12 + vpblendvb %ymm6, %ymm3, %ymm7, %ymm3 + vmovdqa %ymm12, 328(%rsp) + vpshufd $78, %ymm3, %ymm3 + vmovdqa 264(%rsp), %ymm12 + vpermq $78, %ymm3, %ymm3 + vpcmpgtq %ymm12, %ymm4, %ymm6 + vpblendvb %ymm6, %ymm12, %ymm4, %ymm7 + vpblendvb %ymm6, %ymm4, %ymm12, %ymm4 + vpcmpgtq %ymm0, %ymm11, %ymm6 + vpshufd $78, %ymm7, %ymm7 + vpermq $78, %ymm7, %ymm7 + vpblendvb %ymm6, %ymm0, %ymm11, %ymm12 + vpblendvb %ymm6, %ymm11, %ymm0, %ymm0 + vpcmpgtq %ymm1, %ymm10, %ymm11 + vpshufd $78, %ymm0, %ymm0 + vmovdqa %ymm12, 296(%rsp) + vpermq $78, %ymm0, %ymm0 + vpblendvb %ymm11, %ymm1, %ymm10, %ymm6 + vpblendvb %ymm11, %ymm10, %ymm1, %ymm1 + vpshufd $78, 360(%rsp), %ymm11 + vpermq $78, %ymm11, %ymm11 + vpshufd $78, %ymm15, %ymm10 + vpcmpgtq %ymm14, %ymm11, %ymm15 + vpshufd $78, %ymm6, %ymm6 + vpermq $78, %ymm10, %ymm10 + vpermq $78, %ymm6, %ymm6 + vpblendvb %ymm15, %ymm14, %ymm11, %ymm12 + vpblendvb %ymm15, %ymm11, %ymm14, %ymm14 + vmovdqa %ymm12, 360(%rsp) + vpcmpgtq %ymm9, %ymm10, %ymm11 + vmovdqa 328(%rsp), %ymm12 + vpblendvb %ymm11, %ymm9, %ymm10, %ymm15 + vpblendvb %ymm11, %ymm10, %ymm9, %ymm10 + vpcmpgtq %ymm13, %ymm8, %ymm9 + vpblendvb %ymm9, %ymm13, %ymm8, %ymm11 + vpblendvb %ymm9, %ymm8, %ymm13, %ymm9 + vpcmpgtq %ymm5, %ymm2, %ymm8 + vpblendvb %ymm8, %ymm5, %ymm2, %ymm13 + vpblendvb %ymm8, %ymm2, %ymm5, %ymm5 + vpcmpgtq %ymm12, %ymm7, %ymm2 + vpblendvb %ymm2, %ymm12, %ymm7, %ymm8 + vpblendvb %ymm2, %ymm7, %ymm12, %ymm7 + vpcmpgtq %ymm4, %ymm3, %ymm2 + vpblendvb %ymm2, %ymm4, %ymm3, %ymm12 + vpblendvb %ymm2, %ymm3, %ymm4, %ymm3 + vmovdqa %ymm12, 328(%rsp) + vmovdqa 296(%rsp), %ymm12 + vpcmpgtq %ymm12, %ymm6, %ymm2 + vpblendvb %ymm2, %ymm12, %ymm6, %ymm4 + vpblendvb %ymm2, %ymm6, %ymm12, %ymm6 + vpcmpgtq %ymm1, %ymm0, %ymm2 + vpblendvb %ymm2, %ymm1, %ymm0, %ymm12 + vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 + vmovdqa %ymm12, 296(%rsp) + vmovdqa 360(%rsp), %ymm12 + vmovdqa %ymm2, 264(%rsp) + vpshufd $78, %ymm12, %ymm0 + vpermq $78, %ymm0, %ymm0 + vpcmpgtq %ymm12, %ymm0, %ymm1 + vpblendvb %ymm1, %ymm12, %ymm0, %ymm2 + vpblendvb %ymm1, %ymm0, %ymm12, %ymm12 + vpblendd $15, %ymm2, %ymm12, %ymm0 + vmovdqa %ymm0, 360(%rsp) + vpshufd $78, %ymm14, %ymm0 + vpermq $78, %ymm0, %ymm0 + vpcmpgtq %ymm14, %ymm0, %ymm1 + vpblendvb %ymm1, %ymm14, %ymm0, %ymm2 + vpblendvb %ymm1, %ymm0, %ymm14, %ymm14 + vpshufd $78, %ymm15, %ymm0 + vpermq $78, %ymm0, %ymm0 + vpblendd $15, %ymm2, %ymm14, %ymm14 + vpcmpgtq %ymm15, %ymm0, %ymm1 + vpblendvb %ymm1, %ymm15, %ymm0, %ymm2 + vpblendvb %ymm1, %ymm0, %ymm15, %ymm15 + vpshufd $78, %ymm10, %ymm0 + vpblendd $15, %ymm2, %ymm15, %ymm1 + vpermq $78, %ymm0, %ymm0 + vmovdqa 328(%rsp), %ymm15 + vmovdqa %ymm1, 232(%rsp) + vpcmpgtq %ymm10, %ymm0, %ymm1 + vpblendvb %ymm1, %ymm10, %ymm0, %ymm2 + vpblendvb %ymm1, %ymm0, %ymm10, %ymm10 + vpshufd $78, %ymm11, %ymm0 + vpermq $78, %ymm0, %ymm0 + vpblendd $15, %ymm2, %ymm10, %ymm10 + vpcmpgtq %ymm11, %ymm0, %ymm1 + vpblendvb %ymm1, %ymm11, %ymm0, %ymm2 + vpblendvb %ymm1, %ymm0, %ymm11, %ymm11 + vpshufd $78, %ymm9, %ymm0 + vpermq $78, %ymm0, %ymm0 + vpblendd $15, %ymm2, %ymm11, %ymm2 + vpcmpgtq %ymm9, %ymm0, %ymm1 + vpblendvb %ymm1, %ymm9, %ymm0, %ymm11 + vpblendvb %ymm1, %ymm0, %ymm9, %ymm9 + vpshufd $78, %ymm13, %ymm1 + vpermq $78, %ymm1, %ymm1 + vpblendd $15, %ymm11, %ymm9, %ymm0 + vmovdqa %ymm0, 200(%rsp) + vpcmpgtq %ymm13, %ymm1, %ymm9 + vmovdqa 264(%rsp), %ymm0 + vpblendvb %ymm9, %ymm13, %ymm1, %ymm11 + vpblendvb %ymm9, %ymm1, %ymm13, %ymm13 + vpshufd $78, %ymm5, %ymm1 + vpermq $78, %ymm1, %ymm1 + vpblendd $15, %ymm11, %ymm13, %ymm13 + 
vpcmpgtq %ymm5, %ymm1, %ymm9 + vpblendvb %ymm9, %ymm5, %ymm1, %ymm11 + vpblendvb %ymm9, %ymm1, %ymm5, %ymm5 + vpshufd $78, %ymm8, %ymm1 + vpermq $78, %ymm1, %ymm1 + vpblendd $15, %ymm11, %ymm5, %ymm5 + vpcmpgtq %ymm8, %ymm1, %ymm9 + vpblendvb %ymm9, %ymm8, %ymm1, %ymm11 + vpblendvb %ymm9, %ymm1, %ymm8, %ymm8 + vpshufd $78, %ymm7, %ymm1 + vpermq $78, %ymm1, %ymm1 + vpblendd $15, %ymm11, %ymm8, %ymm8 + vpcmpgtq %ymm7, %ymm1, %ymm9 + vpblendvb %ymm9, %ymm7, %ymm1, %ymm11 + vpblendvb %ymm9, %ymm1, %ymm7, %ymm7 + vpshufd $78, %ymm15, %ymm1 + vpermq $78, %ymm1, %ymm1 + vpblendd $15, %ymm11, %ymm7, %ymm7 + vpcmpgtq %ymm15, %ymm1, %ymm9 + vpblendvb %ymm9, %ymm15, %ymm1, %ymm11 + vpblendvb %ymm9, %ymm1, %ymm15, %ymm9 + vpshufd $78, %ymm3, %ymm1 + vpermq $78, %ymm1, %ymm1 + vpblendd $15, %ymm11, %ymm9, %ymm9 + vpcmpgtq %ymm3, %ymm1, %ymm11 + vpblendvb %ymm11, %ymm3, %ymm1, %ymm12 + vpblendvb %ymm11, %ymm1, %ymm3, %ymm3 + vpshufd $78, %ymm4, %ymm1 + vpermq $78, %ymm1, %ymm1 + vpblendd $15, %ymm12, %ymm3, %ymm3 + vpcmpgtq %ymm4, %ymm1, %ymm11 + vpblendvb %ymm11, %ymm4, %ymm1, %ymm12 + vpblendvb %ymm11, %ymm1, %ymm4, %ymm4 + vpshufd $78, %ymm6, %ymm1 + vpermq $78, %ymm1, %ymm1 + vpblendd $15, %ymm12, %ymm4, %ymm4 + vmovdqa 296(%rsp), %ymm12 + vpcmpgtq %ymm6, %ymm1, %ymm15 + vpblendvb %ymm15, %ymm6, %ymm1, %ymm11 + vpblendvb %ymm15, %ymm1, %ymm6, %ymm6 + vpshufd $78, %ymm12, %ymm1 + vpermq $78, %ymm1, %ymm1 + vpblendd $15, %ymm11, %ymm6, %ymm15 + vpcmpgtq %ymm12, %ymm1, %ymm11 + vpblendvb %ymm11, %ymm12, %ymm1, %ymm6 + vpblendvb %ymm11, %ymm1, %ymm12, %ymm11 + vpshufd $78, %ymm0, %ymm1 + vpermq $78, %ymm1, %ymm1 + vpblendd $15, %ymm6, %ymm11, %ymm11 + vpcmpgtq %ymm0, %ymm1, %ymm6 + vpblendvb %ymm6, %ymm0, %ymm1, %ymm12 + vpblendvb %ymm6, %ymm1, %ymm0, %ymm1 + vmovdqa 360(%rsp), %ymm0 + vpblendd $15, %ymm12, %ymm1, %ymm1 + vpshufd $78, %ymm0, %ymm12 + vpcmpgtq %ymm0, %ymm12, %ymm6 + vpunpckhqdq %ymm6, %ymm6, %ymm6 + vpblendvb %ymm6, %ymm12, %ymm0, %ymm6 + vpshufd $78, %ymm14, %ymm12 + vmovdqa %ymm6, 328(%rsp) + vpcmpgtq %ymm14, %ymm12, %ymm6 + vpunpckhqdq %ymm6, %ymm6, %ymm6 + vpblendvb %ymm6, %ymm12, %ymm14, %ymm6 + vmovdqa %ymm6, 360(%rsp) + vmovdqa 232(%rsp), %ymm0 + vpshufd $78, %ymm0, %ymm12 + vpcmpgtq %ymm0, %ymm12, %ymm6 + vpunpckhqdq %ymm6, %ymm6, %ymm6 + vpblendvb %ymm6, %ymm12, %ymm0, %ymm6 + vpshufd $78, %ymm10, %ymm12 + vmovdqa 200(%rsp), %ymm0 + vmovdqa %ymm6, 296(%rsp) + vpcmpgtq %ymm10, %ymm12, %ymm6 + vpunpckhqdq %ymm6, %ymm6, %ymm6 + vpblendvb %ymm6, %ymm12, %ymm10, %ymm6 + vpshufd $78, %ymm2, %ymm10 + vpshufd $78, %ymm8, %ymm12 + vmovdqa %ymm6, 264(%rsp) + vpcmpgtq %ymm2, %ymm10, %ymm6 + vpunpckhqdq %ymm6, %ymm6, %ymm6 + vpblendvb %ymm6, %ymm10, %ymm2, %ymm6 + vmovdqa %ymm6, 232(%rsp) + vpshufd $78, %ymm0, %ymm6 + vpcmpgtq %ymm0, %ymm6, %ymm2 + vpunpckhqdq %ymm2, %ymm2, %ymm2 + vpblendvb %ymm2, %ymm6, %ymm0, %ymm6 + vpshufd $78, %ymm13, %ymm2 + vmovdqa %ymm6, 200(%rsp) + vpcmpgtq %ymm13, %ymm2, %ymm0 + vpshufd $78, %ymm7, %ymm6 + vpunpckhqdq %ymm0, %ymm0, %ymm0 + vpblendvb %ymm0, %ymm2, %ymm13, %ymm14 + vpshufd $78, %ymm5, %ymm2 + vpcmpgtq %ymm5, %ymm2, %ymm0 + vpunpckhqdq %ymm0, %ymm0, %ymm0 + vpblendvb %ymm0, %ymm2, %ymm5, %ymm13 + vpcmpgtq %ymm8, %ymm12, %ymm0 + vpshufd $78, %ymm3, %ymm5 + vpunpckhqdq %ymm0, %ymm0, %ymm0 + vpblendvb %ymm0, %ymm12, %ymm8, %ymm12 + vpcmpgtq %ymm7, %ymm6, %ymm0 + vpunpckhqdq %ymm0, %ymm0, %ymm0 + vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 + vpshufd $78, %ymm9, %ymm0 + vpshufd $78, %ymm15, %ymm7 + vpcmpgtq %ymm9, %ymm0, %ymm2 + vpunpckhqdq %ymm2, %ymm2, %ymm2 
+ vpblendvb %ymm2, %ymm0, %ymm9, %ymm0 + vpcmpgtq %ymm3, %ymm5, %ymm2 + vpshufd $78, %ymm1, %ymm9 + vpunpckhqdq %ymm2, %ymm2, %ymm2 + vpblendvb %ymm2, %ymm5, %ymm3, %ymm5 + vpshufd $78, %ymm4, %ymm2 + vpcmpgtq %ymm4, %ymm2, %ymm3 + vpunpckhqdq %ymm3, %ymm3, %ymm3 + vpblendvb %ymm3, %ymm2, %ymm4, %ymm2 + vpcmpgtq %ymm15, %ymm7, %ymm3 + vpcmpgtq %ymm1, %ymm9, %ymm4 + vpunpckhqdq %ymm3, %ymm3, %ymm3 + vpblendvb %ymm3, %ymm7, %ymm15, %ymm7 + vpshufd $78, %ymm11, %ymm3 + vpunpckhqdq %ymm4, %ymm4, %ymm4 + vpcmpgtq %ymm11, %ymm3, %ymm8 + vpblendvb %ymm4, %ymm9, %ymm1, %ymm9 + vpunpckhqdq %ymm8, %ymm8, %ymm8 + vpblendvb %ymm8, %ymm3, %ymm11, %ymm3 +.L1483: + vmovdqa 328(%rsp), %ymm4 + vmovdqu %ymm4, (%rdx) + movq 184(%rsp), %rdx + vmovdqa 360(%rsp), %ymm4 + vmovdqu %ymm4, (%r15) + vmovdqa 296(%rsp), %ymm4 + vmovdqu %ymm4, (%r14) + vmovdqa 264(%rsp), %ymm4 + vmovdqu %ymm4, 0(%r13) + vmovdqa 232(%rsp), %ymm4 + vmovdqu %ymm4, (%r12) + vmovdqa 200(%rsp), %ymm4 + vmovdqu %ymm4, (%rbx) + movq 176(%rsp), %rbx + vmovdqu %ymm14, (%r11) + vmovdqu %ymm13, (%r10) + vmovdqu %ymm12, (%r9) + vmovdqu %ymm6, (%r8) + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm5, (%rsi) + vmovdqu %ymm2, (%rdx) + vmovdqu %ymm7, (%rbx) + vmovdqu %ymm3, (%rcx) + vmovdqu %ymm9, (%rax) + vzeroupper + leaq -40(%rbp), %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L1486: + .cfi_restore_state + vmovdqa %ymm3, %ymm13 + vmovdqa 72(%rsp), %ymm7 + vmovdqa 104(%rsp), %ymm3 + vmovdqa %ymm10, %ymm14 + vmovdqa 136(%rsp), %ymm9 + jmp .L1483 + .p2align 4,,10 + .p2align 3 +.L1487: + vmovdqa %ymm4, %ymm12 + vmovdqa %ymm10, %ymm6 + vmovdqa %ymm1, %ymm0 + jmp .L1483 + .cfi_endproc +.LFE18807: + .size _ZN3hwy6N_AVX26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0, .-_ZN3hwy6N_AVX26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + .section .text._ZN3hwy6N_AVX26detail7RecurseINS0_4SimdIlLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0,"ax",@progbits + .p2align 4 + .type _ZN3hwy6N_AVX26detail7RecurseINS0_4SimdIlLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0, @function +_ZN3hwy6N_AVX26detail7RecurseINS0_4SimdIlLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0: +.LFB18808: + .cfi_startproc + leaq 8(%rsp), %r10 + .cfi_def_cfa 10, 0 + andq $-32, %rsp + pushq -8(%r10) + pushq %rbp + movq %rsp, %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + pushq %r15 + pushq %r14 + pushq %r13 + .cfi_escape 0x10,0xf,0x2,0x76,0x78 + .cfi_escape 0x10,0xe,0x2,0x76,0x70 + .cfi_escape 0x10,0xd,0x2,0x76,0x68 + movq %rdi, %r13 + pushq %r12 + .cfi_escape 0x10,0xc,0x2,0x76,0x60 + movq %rcx, %r12 + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x58,0x6 + pushq %rbx + addq $-128, %rsp + .cfi_escape 0x10,0x3,0x2,0x76,0x50 + movq %rsi, -104(%rbp) + movq %rdx, -88(%rbp) + movq %r9, -96(%rbp) + cmpq $64, %rdx + jbe .L1654 + movq %rdi, %rax + movq %rdi, -112(%rbp) + movq %r8, %rbx + shrq $3, %rax + movq %rax, %rdx + movq %rax, -144(%rbp) + andl $7, %edx + jne .L1655 + movq -88(%rbp), %r14 + movq %rdi, %rax +.L1501: + movq 8(%rbx), %rcx + movq 16(%rbx), %r10 + movq %rcx, %rdi + leaq 1(%r10), %r8 + leaq (%rcx,%rcx,8), %rsi + xorq (%rbx), %r8 + shrq $11, %rdi + rorx $40, %rcx, %rdx + leaq 2(%r10), %rcx + addq %r8, %rdx + xorq %rdi, 
%rsi + movq %rdx, %r9 + rorx $40, %rdx, %rdi + xorq %rcx, %rsi + shrq $11, %r9 + leaq (%rdx,%rdx,8), %rcx + leaq 3(%r10), %rdx + addq %rsi, %rdi + xorq %r9, %rcx + movq %rdi, %r9 + xorq %rdx, %rcx + leaq (%rdi,%rdi,8), %rdx + rorx $40, %rdi, %r11 + shrq $11, %r9 + addq %rcx, %r11 + leaq 4(%r10), %rdi + addq $5, %r10 + xorq %r9, %rdx + movq %r11, %r15 + rorx $40, %r11, %r9 + movq %r10, 16(%rbx) + xorq %rdi, %rdx + shrq $11, %r15 + leaq (%r11,%r11,8), %rdi + addq %rdx, %r9 + xorq %r15, %rdi + movq %r9, %r15 + leaq (%r9,%r9,8), %r11 + xorq %r10, %rdi + rorx $40, %r9, %r9 + shrq $11, %r15 + addq %rdi, %r9 + movl %edi, %edi + movabsq $34359738359, %r10 + xorq %r15, %r11 + movl %edx, %r15d + vmovq %r11, %xmm7 + movl %r8d, %r11d + vpinsrq $1, %r9, %xmm7, %xmm0 + movq %r14, %r9 + shrq $3, %r9 + cmpq %r10, %r14 + movl $4294967295, %r10d + movl %ecx, %r14d + cmova %r10, %r9 + shrq $32, %r8 + movl %esi, %r10d + vmovdqu %xmm0, (%rbx) + shrq $32, %rsi + imulq %r9, %r11 + shrq $32, %rcx + shrq $32, %rdx + imulq %r9, %r8 + imulq %r9, %r10 + shrq $32, %r11 + imulq %r9, %rsi + imulq %r9, %r14 + shrq $32, %r8 + imulq %r9, %rcx + shrq $32, %r10 + imulq %r9, %r15 + shrq $32, %rsi + imulq %r9, %rdx + shrq $32, %r14 + imulq %r9, %rdi + movq %r11, %r9 + shrq $32, %rcx + salq $6, %r9 + shrq $32, %r15 + vmovdqa (%rax,%r9), %ymm2 + movq %r8, %r9 + shrq $32, %rdx + leaq 4(,%r8,8), %r8 + salq $6, %r9 + shrq $32, %rdi + vmovdqa (%rax,%r9), %ymm3 + movq %r10, %r9 + salq $6, %r9 + vmovdqa (%rax,%r9), %ymm1 + movq %rsi, %r9 + leaq 4(,%rsi,8), %rsi + salq $6, %r9 + vpcmpgtq %ymm2, %ymm1, %ymm4 + vmovdqa (%rax,%r9), %ymm6 + movq %r14, %r9 + salq $6, %r9 + vpblendvb %ymm4, %ymm2, %ymm1, %ymm0 + vpblendvb %ymm4, %ymm1, %ymm2, %ymm2 + vpcmpgtq %ymm0, %ymm3, %ymm1 + vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 + vmovdqa (%rax,%r9), %ymm3 + movq %rcx, %r9 + leaq 4(,%rcx,8), %rcx + vpcmpgtq %ymm0, %ymm2, %ymm1 + salq $6, %r9 + vpblendvb %ymm1, %ymm0, %ymm2, %ymm2 + vmovdqa (%rax,%r9), %ymm1 + movq %r15, %r9 + salq $6, %r9 + vmovdqa %ymm2, (%r12) + vpcmpgtq %ymm6, %ymm1, %ymm4 + vpblendvb %ymm4, %ymm6, %ymm1, %ymm0 + vpblendvb %ymm4, %ymm1, %ymm6, %ymm6 + vmovdqa (%rax,%r9), %ymm4 + movq %rdx, %r9 + vpcmpgtq %ymm0, %ymm3, %ymm1 + salq $6, %r9 + leaq 4(,%rdx,8), %rdx + vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 + vmovdqa (%rax,%r9), %ymm3 + movq %rdi, %r9 + vpcmpgtq %ymm0, %ymm6, %ymm1 + salq $6, %r9 + vpblendvb %ymm1, %ymm0, %ymm6, %ymm6 + vmovdqa (%rax,%r9), %ymm1 + leaq 4(,%r11,8), %r9 + vmovdqa %ymm6, 64(%r12) + vpcmpgtq %ymm4, %ymm1, %ymm5 + vpblendvb %ymm5, %ymm4, %ymm1, %ymm0 + vpblendvb %ymm5, %ymm1, %ymm4, %ymm4 + vmovdqa (%rax,%r8,8), %ymm5 + leaq 4(,%r10,8), %r8 + vpcmpgtq %ymm0, %ymm3, %ymm1 + vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 + vmovdqa (%rax,%r8,8), %ymm3 + vpcmpgtq %ymm0, %ymm4, %ymm1 + vpblendvb %ymm1, %ymm0, %ymm4, %ymm4 + vmovdqa (%rax,%r9,8), %ymm0 + vmovdqa %ymm4, 128(%r12) + vpcmpgtq %ymm0, %ymm3, %ymm7 + vpblendvb %ymm7, %ymm0, %ymm3, %ymm1 + vpblendvb %ymm7, %ymm3, %ymm0, %ymm0 + vpcmpgtq %ymm1, %ymm5, %ymm3 + vpblendvb %ymm3, %ymm5, %ymm1, %ymm1 + vmovdqa (%rax,%rcx,8), %ymm5 + leaq 4(,%r15,8), %rcx + vpcmpgtq %ymm1, %ymm0, %ymm3 + vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 + vmovdqa (%rax,%rsi,8), %ymm1 + leaq 4(,%r14,8), %rsi + leaq 192(%r12), %r14 + vmovdqa (%rax,%rsi,8), %ymm7 + vmovdqa %ymm0, 32(%r12) + vpcmpgtq %ymm1, %ymm5, %ymm8 + vpblendvb %ymm8, %ymm1, %ymm5, %ymm3 + vpblendvb %ymm8, %ymm5, %ymm1, %ymm1 + vmovdqa (%rax,%rdx,8), %ymm8 + leaq 4(,%rdi,8), %rdx + vpcmpgtq %ymm3, %ymm7, %ymm5 + vpblendvb %ymm5, %ymm7, 
%ymm3, %ymm3 + vmovdqa (%rax,%rdx,8), %ymm7 + vpcmpgtq %ymm3, %ymm1, %ymm5 + vpblendvb %ymm5, %ymm3, %ymm1, %ymm5 + vmovdqa (%rax,%rcx,8), %ymm1 + vmovdqa %ymm5, 96(%r12) + vpcmpgtq %ymm1, %ymm7, %ymm9 + vpblendvb %ymm9, %ymm1, %ymm7, %ymm3 + vpblendvb %ymm9, %ymm7, %ymm1, %ymm1 + vpcmpgtq %ymm3, %ymm8, %ymm7 + vpblendvb %ymm7, %ymm8, %ymm3, %ymm3 + vpcmpgtq %ymm3, %ymm1, %ymm7 + vpblendvb %ymm7, %ymm3, %ymm1, %ymm1 + vpbroadcastq %xmm2, %ymm3 + vpxor %ymm2, %ymm3, %ymm2 + vpxor %ymm0, %ymm3, %ymm0 + vpxor %ymm6, %ymm3, %ymm6 + vmovdqa %ymm1, 160(%r12) + vpor %ymm2, %ymm0, %ymm0 + vpxor %ymm5, %ymm3, %ymm5 + vpxor %ymm4, %ymm3, %ymm4 + vpor %ymm6, %ymm0, %ymm0 + vpxor %ymm1, %ymm3, %ymm1 + vpxor %xmm2, %xmm2, %xmm2 + vpor %ymm5, %ymm0, %ymm0 + vpor %ymm4, %ymm0, %ymm0 + vpor %ymm1, %ymm0, %ymm0 + vpxor 192(%r12), %ymm3, %ymm1 + vpor %ymm0, %ymm1, %ymm1 + vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 + vpxor %xmm1, %xmm1, %xmm1 + vpcmpeqq %ymm1, %ymm0, %ymm0 + vmovmskpd %ymm0, %eax + cmpl $15, %eax + je .L1503 + vmovdqa .LC13(%rip), %ymm0 + movl $2, %esi + movq %r12, %rdi + vmovdqu %ymm0, 192(%r12) + vmovdqu %ymm0, 224(%r12) + vmovdqu %ymm0, 256(%r12) + vzeroupper + call _ZN3hwy6N_AVX26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + vpbroadcastq (%r12), %ymm2 + vpcmpeqd %ymm0, %ymm0, %ymm0 + vpbroadcastq 184(%r12), %ymm1 + vpaddq %ymm0, %ymm1, %ymm0 + vpcmpeqq %ymm2, %ymm0, %ymm0 + vmovmskpd %ymm0, %eax + cmpl $15, %eax + jne .L1505 + movq -88(%rbp), %rsi + leaq -80(%rbp), %rdx + movq %r14, %rcx + vmovdqa %ymm2, %ymm0 + movq %r13, %rdi + call _ZN3hwy6N_AVX26detail22MaybePartitionTwoValueINS0_4SimdIlLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, %al + jne .L1649 +.L1505: + movq 96(%r12), %rdx + cmpq %rdx, 88(%r12) + jne .L1585 + cmpq 80(%r12), %rdx + jne .L1540 + cmpq 72(%r12), %rdx + jne .L1586 + cmpq 64(%r12), %rdx + jne .L1587 + cmpq 56(%r12), %rdx + jne .L1588 + cmpq 48(%r12), %rdx + jne .L1589 + cmpq 40(%r12), %rdx + jne .L1590 + cmpq 32(%r12), %rdx + jne .L1591 + cmpq 24(%r12), %rdx + jne .L1592 + cmpq 16(%r12), %rdx + jne .L1593 + cmpq 8(%r12), %rdx + jne .L1594 + movq (%r12), %rax + cmpq %rax, %rdx + jne .L1656 +.L1542: + vmovq %rax, %xmm7 + vpbroadcastq %xmm7, %ymm0 +.L1653: + movl $1, -112(%rbp) +.L1538: + cmpq $0, -96(%rbp) + je .L1657 + movq -88(%rbp), %rax + leaq -4(%rax), %r10 + movq %r10, %rsi + movq %r10, %rdx + vmovdqu 0(%r13,%r10,8), %ymm4 + andl $15, %esi + andl $12, %edx + je .L1596 + vmovdqu 0(%r13), %ymm1 + vpcmpeqd %ymm2, %ymm2, %ymm2 + leaq _ZZN3hwy6N_AVX26detail18IndicesFromBits256IlLPv0EEENS0_6Vec256IjEEmE11u32_indices(%rip), %r8 + xorl %ecx, %ecx + vpcmpgtq %ymm0, %ymm1, %ymm5 + vpxor %ymm2, %ymm5, %ymm3 + vmovmskpd %ymm3, %eax + movq %rax, %rdx + popcntq %rax, %rax + leaq 0(%r13,%rax,8), %rax + salq $5, %rdx + vmovdqa (%r8,%rdx), %ymm3 + vmovmskpd %ymm5, %edx + popcntq %rdx, %rcx + salq $5, %rdx + vpslld $28, %ymm3, %ymm6 + vpermd %ymm1, %ymm3, %ymm3 + vpmaskmovq %ymm3, %ymm6, 0(%r13) + vmovdqa (%r8,%rdx), %ymm3 + vpermd %ymm1, %ymm3, %ymm1 + vmovdqu %ymm1, (%r12) + testb $8, %r10b + je .L1548 + vmovdqu 32(%r13), %ymm1 + vpcmpgtq %ymm0, %ymm1, %ymm5 + vpxor %ymm2, %ymm5, %ymm3 + vmovmskpd %ymm3, %edx + movq %rdx, %rdi + popcntq %rdx, %rdx + salq $5, %rdi + vmovdqa (%r8,%rdi), %ymm3 + vpslld $28, %ymm3, %ymm6 + vpermd %ymm1, %ymm3, %ymm3 + vpmaskmovq %ymm3, %ymm6, (%rax) + leaq (%rax,%rdx,8), %rax + vmovmskpd %ymm5, %edx + 
movq %rdx, %rdi + popcntq %rdx, %rdx + salq $5, %rdi + vmovdqa (%r8,%rdi), %ymm3 + vpermd %ymm1, %ymm3, %ymm1 + vmovdqu %ymm1, (%r12,%rcx,8) + addq %rdx, %rcx + cmpq $11, %rsi + jbe .L1548 + vmovdqu 64(%r13), %ymm1 + vpcmpgtq %ymm0, %ymm1, %ymm3 + vpxor %ymm2, %ymm3, %ymm2 + vmovmskpd %ymm2, %edx + movq %rdx, %rdi + popcntq %rdx, %rdx + salq $5, %rdi + vmovdqa (%r8,%rdi), %ymm2 + vpslld $28, %ymm2, %ymm5 + vpermd %ymm1, %ymm2, %ymm2 + vpmaskmovq %ymm2, %ymm5, (%rax) + leaq (%rax,%rdx,8), %rax + vmovmskpd %ymm3, %edx + movq %rdx, %rdi + popcntq %rdx, %rdx + salq $5, %rdi + vmovdqa (%r8,%rdi), %ymm2 + vpermd %ymm1, %ymm2, %ymm1 + vmovdqu %ymm1, (%r12,%rcx,8) + addq %rdx, %rcx +.L1548: + leaq -4(%rsi), %rdx + leaq 1(%rsi), %rdi + andq $-4, %rdx + leaq 0(,%rcx,8), %r9 + addq $4, %rdx + cmpq $4, %rdi + movl $4, %edi + cmovbe %rdi, %rdx +.L1547: + cmpq %rsi, %rdx + je .L1549 + subq %rdx, %rsi + vmovdqu 0(%r13,%rdx,8), %ymm2 + vmovq %rsi, %xmm1 + vpbroadcastq %xmm1, %ymm1 + vpcmpgtq %ymm0, %ymm2, %ymm3 + vpcmpgtq .LC3(%rip), %ymm1, %ymm1 + vpandn %ymm1, %ymm3, %ymm5 + vpand %ymm1, %ymm3, %ymm3 + vmovmskpd %ymm5, %edx + movq %rdx, %rsi + popcntq %rdx, %rdx + salq $5, %rsi + vmovdqa (%r8,%rsi), %ymm5 + vpslld $28, %ymm5, %ymm6 + vpermd %ymm2, %ymm5, %ymm5 + vpmaskmovq %ymm5, %ymm6, (%rax) + leaq (%rax,%rdx,8), %rax + vmovmskpd %ymm3, %edx + movq %rdx, %rsi + popcntq %rdx, %rdx + addq %rdx, %rcx + salq $5, %rsi + vmovdqa (%r8,%rsi), %ymm1 + vpermd %ymm2, %ymm1, %ymm2 + vmovdqu %ymm2, (%r12,%r9) + leaq 0(,%rcx,8), %r9 +.L1549: + movq %r10, %rdx + subq %rcx, %rdx + leaq 0(%r13,%rdx,8), %r11 + cmpl $8, %r9d + jnb .L1550 + testl %r9d, %r9d + jne .L1658 +.L1551: + cmpl $8, %r9d + jnb .L1554 + testl %r9d, %r9d + jne .L1659 +.L1555: + movq %rax, %rcx + subq %r13, %rcx + sarq $3, %rcx + subq %rcx, %r10 + subq %rcx, %rdx + movq %rcx, %r15 + movq %r10, -144(%rbp) + leaq (%rax,%rdx,8), %rcx + je .L1597 + leaq 128(%rax), %rsi + leaq -128(%rcx), %r10 + vmovdqu (%rax), %ymm12 + vmovdqu 32(%rax), %ymm11 + vmovdqu 64(%rax), %ymm10 + vmovdqu 96(%rax), %ymm9 + vmovdqu -128(%rcx), %ymm8 + vmovdqu -96(%rcx), %ymm7 + vmovdqu -64(%rcx), %ymm6 + vmovdqu -32(%rcx), %ymm5 + cmpq %r10, %rsi + je .L1598 + xorl %ecx, %ecx + leaq _ZZN3hwy6N_AVX26detail21IndicesFromNotBits256IlLPv0EEENS0_6Vec256IjEEmE11u32_indices(%rip), %rdi + movl $4, %r11d + jmp .L1562 + .p2align 4,,10 + .p2align 3 +.L1661: + vmovdqu -128(%r10), %ymm13 + vmovdqu -96(%r10), %ymm3 + prefetcht0 -512(%r10) + addq $-128, %r10 + vmovdqu 64(%r10), %ymm2 + vmovdqu 96(%r10), %ymm1 +.L1561: + vpcmpgtq %ymm0, %ymm13, %ymm14 + vmovmskpd %ymm14, %r9d + movq %r9, %r14 + popcntq %r9, %r9 + salq $5, %r14 + vmovdqa (%rdi,%r14), %ymm14 + leaq -4(%rdx,%rcx), %r14 + vpermd %ymm13, %ymm14, %ymm13 + vmovdqu %ymm13, (%rax,%rcx,8) + addq $4, %rcx + vmovdqu %ymm13, (%rax,%r14,8) + vpcmpgtq %ymm0, %ymm3, %ymm13 + subq %r9, %rcx + vmovmskpd %ymm13, %r9d + movq %r9, %r14 + popcntq %r9, %r9 + salq $5, %r14 + vmovdqa (%rdi,%r14), %ymm13 + leaq -8(%rcx,%rdx), %r14 + vpermd %ymm3, %ymm13, %ymm3 + vmovdqu %ymm3, (%rax,%rcx,8) + vmovdqu %ymm3, (%rax,%r14,8) + vpcmpgtq %ymm0, %ymm2, %ymm3 + movq %r11, %r14 + subq %r9, %r14 + addq %r14, %rcx + vmovmskpd %ymm3, %r9d + movq %r9, %r14 + popcntq %r9, %r9 + salq $5, %r14 + vmovdqa (%rdi,%r14), %ymm3 + leaq -12(%rcx,%rdx), %r14 + subq $16, %rdx + vpermd %ymm2, %ymm3, %ymm2 + vmovdqu %ymm2, (%rax,%rcx,8) + vmovdqu %ymm2, (%rax,%r14,8) + vpcmpgtq %ymm0, %ymm1, %ymm2 + movq %r11, %r14 + subq %r9, %r14 + leaq (%r14,%rcx), %r9 + vmovmskpd %ymm2, %ecx + 
movq %rcx, %r14 + popcntq %rcx, %rcx + salq $5, %r14 + vmovdqa (%rdi,%r14), %ymm2 + leaq (%r9,%rdx), %r14 + vpermd %ymm1, %ymm2, %ymm1 + vmovdqu %ymm1, (%rax,%r9,8) + vmovdqu %ymm1, (%rax,%r14,8) + movq %r11, %r14 + subq %rcx, %r14 + leaq (%r14,%r9), %rcx + cmpq %r10, %rsi + je .L1660 +.L1562: + movq %rsi, %r9 + subq %rax, %r9 + sarq $3, %r9 + subq %rcx, %r9 + cmpq $16, %r9 + ja .L1661 + vmovdqu (%rsi), %ymm13 + vmovdqu 32(%rsi), %ymm3 + prefetcht0 512(%rsi) + subq $-128, %rsi + vmovdqu -64(%rsi), %ymm2 + vmovdqu -32(%rsi), %ymm1 + jmp .L1561 + .p2align 4,,10 + .p2align 3 +.L1655: + movl $8, %eax + subq %rdx, %rax + leaq (%rdi,%rax,8), %rax + movq -88(%rbp), %rdi + leaq -8(%rdx,%rdi), %r14 + jmp .L1501 + .p2align 4,,10 + .p2align 3 +.L1660: + leaq (%rdx,%rcx), %r9 + leaq (%rax,%rcx,8), %r10 + addq $4, %rcx +.L1559: + vpcmpgtq %ymm0, %ymm12, %ymm1 + vmovmskpd %ymm1, %esi + movq %rsi, %r11 + popcntq %rsi, %rsi + subq %rsi, %rcx + salq $5, %r11 + vmovdqa (%rdi,%r11), %ymm1 + vpermd %ymm12, %ymm1, %ymm12 + vpcmpgtq %ymm0, %ymm11, %ymm1 + vmovdqu %ymm12, (%r10) + vmovdqu %ymm12, -32(%rax,%r9,8) + vmovmskpd %ymm1, %esi + movq %rsi, %r9 + popcntq %rsi, %rsi + salq $5, %r9 + vmovdqa (%rdi,%r9), %ymm1 + leaq -8(%rdx,%rcx), %r9 + vpermd %ymm11, %ymm1, %ymm11 + vpcmpgtq %ymm0, %ymm10, %ymm1 + vmovdqu %ymm11, (%rax,%rcx,8) + subq %rsi, %rcx + vmovdqu %ymm11, (%rax,%r9,8) + addq $4, %rcx + vmovmskpd %ymm1, %r9d + movq %r9, %rsi + popcntq %r9, %r9 + salq $5, %rsi + vmovdqa (%rdi,%rsi), %ymm1 + leaq -12(%rdx,%rcx), %rsi + vpermd %ymm10, %ymm1, %ymm10 + vpcmpgtq %ymm0, %ymm9, %ymm1 + vmovdqu %ymm10, (%rax,%rcx,8) + vmovdqu %ymm10, (%rax,%rsi,8) + movl $4, %esi + movq %rsi, %r10 + subq %r9, %r10 + vmovmskpd %ymm1, %r9d + addq %r10, %rcx + movq %r9, %r10 + popcntq %r9, %r9 + salq $5, %r10 + vmovdqa (%rdi,%r10), %ymm1 + leaq -16(%rdx,%rcx), %r10 + vpermd %ymm9, %ymm1, %ymm9 + vpcmpgtq %ymm0, %ymm8, %ymm1 + vmovdqu %ymm9, (%rax,%rcx,8) + vmovdqu %ymm9, (%rax,%r10,8) + movq %rsi, %r10 + subq %r9, %r10 + leaq (%r10,%rcx), %r9 + vmovmskpd %ymm1, %ecx + movq %rcx, %r10 + popcntq %rcx, %rcx + salq $5, %r10 + vmovdqa (%rdi,%r10), %ymm1 + leaq -20(%rdx,%r9), %r10 + vpermd %ymm8, %ymm1, %ymm8 + vpcmpgtq %ymm0, %ymm7, %ymm1 + vmovdqu %ymm8, (%rax,%r9,8) + vmovdqu %ymm8, (%rax,%r10,8) + movq %rsi, %r10 + subq %rcx, %r10 + leaq (%r10,%r9), %rcx + vmovmskpd %ymm1, %r9d + movq %r9, %r10 + popcntq %r9, %r9 + salq $5, %r10 + vmovdqa (%rdi,%r10), %ymm1 + leaq -24(%rdx,%rcx), %r10 + vpermd %ymm7, %ymm1, %ymm7 + vpcmpgtq %ymm0, %ymm6, %ymm1 + vmovdqu %ymm7, (%rax,%rcx,8) + vmovdqu %ymm7, (%rax,%r10,8) + movq %rsi, %r10 + subq %r9, %r10 + leaq (%r10,%rcx), %r9 + vmovmskpd %ymm1, %ecx + movq %rcx, %r10 + salq $5, %r10 + vmovdqa (%rdi,%r10), %ymm1 + leaq -28(%rdx,%r9), %r10 + vpermd %ymm6, %ymm1, %ymm6 + vpcmpgtq %ymm0, %ymm5, %ymm1 + vmovdqu %ymm6, (%rax,%r9,8) + vmovdqu %ymm6, (%rax,%r10,8) + xorl %r10d, %r10d + popcntq %rcx, %r10 + movq %rsi, %rcx + subq %r10, %rcx + addq %r9, %rcx + vmovmskpd %ymm1, %r9d + movq %r9, %r10 + leaq -32(%rdx,%rcx), %rdx + popcntq %r9, %r9 + subq %r9, %rsi + salq $5, %r10 + vmovdqa (%rdi,%r10), %ymm1 + movq -144(%rbp), %rdi + vpermd %ymm5, %ymm1, %ymm5 + vmovdqu %ymm5, (%rax,%rcx,8) + vmovdqu %ymm5, (%rax,%rdx,8) + leaq (%rsi,%rcx), %rdx + subq %rdx, %rdi + leaq (%rax,%rdx,8), %rcx +.L1558: + movq %rcx, %rsi + cmpq $3, %rdi + ja .L1563 + movq -144(%rbp), %rdi + leaq -32(%rax,%rdi,8), %rsi +.L1563: + vpcmpgtq %ymm0, %ymm4, %ymm0 + vpcmpeqd %ymm1, %ymm1, %ymm1 + vmovdqu (%rsi), %ymm6 + movq 
-144(%rbp), %rdi + vmovdqu %ymm6, (%rax,%rdi,8) + vpxor %ymm1, %ymm0, %ymm1 + vmovdqa %ymm6, -176(%rbp) + vmovmskpd %ymm1, %esi + movq %rsi, %rdi + popcntq %rsi, %rsi + addq %rsi, %rdx + salq $5, %rdi + leaq (%r15,%rdx), %r14 + vmovdqa (%r8,%rdi), %ymm1 + vpslld $28, %ymm1, %ymm2 + vpermd %ymm4, %ymm1, %ymm1 + vpmaskmovq %ymm1, %ymm2, (%rcx) + vmovmskpd %ymm0, %ecx + salq $5, %rcx + vmovdqa (%r8,%rcx), %ymm0 + vpslld $28, %ymm0, %ymm1 + vpermd %ymm4, %ymm0, %ymm4 + vpmaskmovq %ymm4, %ymm1, (%rax,%rdx,8) + movq -96(%rbp), %r15 + subq $1, %r15 + cmpl $2, -112(%rbp) + je .L1662 + movq -104(%rbp), %rsi + movq %r15, %r9 + movq %rbx, %r8 + movq %r12, %rcx + movq %r14, %rdx + movq %r13, %rdi + vzeroupper + call _ZN3hwy6N_AVX26detail7RecurseINS0_4SimdIlLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + cmpl $3, -112(%rbp) + je .L1649 +.L1565: + movq -88(%rbp), %rdx + movq -104(%rbp), %rsi + leaq 0(%r13,%r14,8), %rdi + movq %r15, %r9 + movq %rbx, %r8 + movq %r12, %rcx + subq %r14, %rdx + call _ZN3hwy6N_AVX26detail7RecurseINS0_4SimdIlLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L1649: + subq $-128, %rsp + popq %rbx + popq %r10 + .cfi_remember_state + .cfi_def_cfa 10, 0 + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + leaq -8(%r10), %rsp + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L1554: + .cfi_restore_state + movq (%r12), %rcx + leaq 8(%r11), %rdi + andq $-8, %rdi + movq %rcx, (%r11) + movl %r9d, %ecx + movq -8(%r12,%rcx), %rsi + movq %rsi, -8(%r11,%rcx) + subq %rdi, %r11 + movq %r12, %rsi + leal (%r9,%r11), %ecx + subq %r11, %rsi + shrl $3, %ecx + rep movsq + jmp .L1555 + .p2align 4,,10 + .p2align 3 +.L1550: + movq (%r11), %rcx + leaq 8(%rax), %rdi + andq $-8, %rdi + movq %rcx, (%rax) + movl %r9d, %ecx + movq -8(%r11,%rcx), %rsi + movq %rsi, -8(%rax,%rcx) + movq %rax, %rcx + movq %r11, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %r9d, %ecx + shrl $3, %ecx + rep movsq + jmp .L1551 + .p2align 4,,10 + .p2align 3 +.L1659: + movzbl (%r12), %ecx + movb %cl, (%r11) + jmp .L1555 + .p2align 4,,10 + .p2align 3 +.L1658: + movzbl (%r11), %ecx + movb %cl, (%rax) + jmp .L1551 + .p2align 4,,10 + .p2align 3 +.L1654: + cmpq $1, %rdx + jbe .L1649 + leaq 512(%rdi), %rax + cmpq %rax, %rsi + jb .L1663 + movl $4, %esi + call _ZN3hwy6N_AVX26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + jmp .L1649 + .p2align 4,,10 + .p2align 3 +.L1503: + movq -144(%rbp), %rax + movl $4, %edi + vmovdqa .LC3(%rip), %ymm4 + vpcmpeqq 0(%r13), %ymm3, %ymm0 + andl $3, %eax + subq %rax, %rdi + vmovq %rdi, %xmm6 + vpbroadcastq %xmm6, %ymm1 + vpcmpgtq %ymm4, %ymm1, %ymm1 + vpandn %ymm1, %ymm0, %ymm0 + vmovmskpd %ymm0, %eax + testl %eax, %eax + jne .L1664 + vpxor %xmm1, %xmm1, %xmm1 + movq -88(%rbp), %r8 + leaq 512(%r13,%rdi,8), %rsi + vmovdqa %ymm1, %ymm0 + vmovdqa %ymm1, %ymm5 + .p2align 4,,10 + .p2align 3 +.L1509: + movq %rdi, %rcx + leaq 64(%rdi), %rdi + cmpq %rdi, %r8 + jb .L1665 + leaq -512(%rsi), %rax +.L1508: + vpxor (%rax), %ymm3, %ymm2 + leaq 64(%rax), %rdx + vpor %ymm0, %ymm2, %ymm0 + vpxor 32(%rax), %ymm3, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpxor 64(%rax), %ymm3, %ymm2 + vpor %ymm0, %ymm2, %ymm0 + vpxor 96(%rax), %ymm3, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpxor 128(%rax), %ymm3, %ymm2 + vpor %ymm0, %ymm2, %ymm0 + vpxor 160(%rax), %ymm3, %ymm2 + leaq 192(%rdx), %rax + vpor %ymm1, %ymm2, %ymm1 + vpxor 128(%rdx), %ymm3, %ymm2 + vpor 
%ymm0, %ymm2, %ymm0 + vpxor 160(%rdx), %ymm3, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + cmpq %rsi, %rax + jne .L1508 + vpor %ymm1, %ymm0, %ymm2 + leaq 704(%rdx), %rsi + vpcmpeqq %ymm5, %ymm2, %ymm2 + vmovmskpd %ymm2, %eax + cmpl $15, %eax + je .L1509 + vpcmpeqq 0(%r13,%rcx,8), %ymm3, %ymm0 + vpcmpeqd %ymm1, %ymm1, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vmovmskpd %ymm0, %eax + testl %eax, %eax + jne .L1511 + .p2align 4,,10 + .p2align 3 +.L1510: + addq $4, %rcx + vpcmpeqq 0(%r13,%rcx,8), %ymm3, %ymm0 + vpxor %ymm1, %ymm0, %ymm0 + vmovmskpd %ymm0, %eax + testl %eax, %eax + je .L1510 +.L1511: + tzcntl %eax, %eax + addq %rcx, %rax +.L1507: + vpbroadcastq 0(%r13,%rax,8), %ymm1 + leaq 0(%r13,%rax,8), %rdi + vpcmpgtq %ymm3, %ymm1, %ymm0 + vmovmskpd %ymm0, %edx + testl %edx, %edx + jne .L1516 + movq -88(%rbp), %rsi + xorl %ecx, %ecx + leaq -4(%rsi), %rax + jmp .L1521 + .p2align 4,,10 + .p2align 3 +.L1517: + vmovmskpd %ymm0, %edx + vmovdqu %ymm3, 0(%r13,%rax,8) + popcntq %rdx, %rdx + addq %rdx, %rcx + leaq -4(%rax), %rdx + cmpq %rdx, %rsi + jbe .L1666 + movq %rdx, %rax +.L1521: + vpcmpeqq 0(%r13,%rax,8), %ymm1, %ymm2 + vpcmpeqq 0(%r13,%rax,8), %ymm3, %ymm0 + vpor %ymm0, %ymm2, %ymm5 + vmovmskpd %ymm5, %edx + cmpl $15, %edx + je .L1517 + vpcmpeqd %ymm3, %ymm3, %ymm3 + leaq 4(%rax), %rsi + vpxor %ymm3, %ymm0, %ymm0 + vpandn %ymm0, %ymm2, %ymm2 + vmovmskpd %ymm2, %edx + tzcntl %edx, %edx + addq %rax, %rdx + addq $8, %rax + vpbroadcastq 0(%r13,%rdx,8), %ymm3 + movq -88(%rbp), %rdx + subq %rcx, %rdx + vmovdqa %ymm3, -80(%rbp) + cmpq %rdx, %rax + ja .L1518 + .p2align 4,,10 + .p2align 3 +.L1519: + vmovdqu %ymm1, -32(%r13,%rax,8) + movq %rax, %rsi + addq $4, %rax + cmpq %rax, %rdx + jnb .L1519 +.L1518: + subq %rsi, %rdx + vmovq %rdx, %xmm6 + vpbroadcastq %xmm6, %ymm0 + vpcmpgtq %ymm4, %ymm0, %ymm0 + vpmaskmovq %ymm1, %ymm0, 0(%r13,%rsi,8) +.L1520: + vpbroadcastq (%r12), %ymm0 + vpcmpeqq .LC14(%rip), %ymm0, %ymm2 + vmovmskpd %ymm2, %eax + cmpl $15, %eax + je .L1583 + vpcmpeqq .LC13(%rip), %ymm0, %ymm2 + vmovmskpd %ymm2, %eax + cmpl $15, %eax + je .L1534 + vpcmpgtq %ymm1, %ymm3, %ymm4 + vpblendvb %ymm4, %ymm1, %ymm3, %ymm2 + vpcmpgtq %ymm2, %ymm0, %ymm2 + vmovmskpd %ymm2, %eax + testl %eax, %eax + jne .L1667 + vmovdqa %ymm0, %ymm2 + movl $64, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L1529: + leaq (%rcx,%rax,4), %rdx + addq $1, %rax + vmovdqu 0(%r13,%rdx,8), %ymm1 + vpcmpgtq %ymm2, %ymm1, %ymm3 + vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 + vmovdqa %ymm1, %ymm2 + cmpq $16, %rax + jne .L1529 + vpcmpgtq %ymm1, %ymm0, %ymm1 + vmovmskpd %ymm1, %eax + testl %eax, %eax + jne .L1653 + leaq 64(%rsi), %rax + cmpq %rax, -88(%rbp) + jb .L1668 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L1529 + .p2align 4,,10 + .p2align 3 +.L1596: + xorl %r9d, %r9d + xorl %ecx, %ecx + leaq _ZZN3hwy6N_AVX26detail18IndicesFromBits256IlLPv0EEENS0_6Vec256IjEEmE11u32_indices(%rip), %r8 + movq %r13, %rax + jmp .L1547 + .p2align 4,,10 + .p2align 3 +.L1662: + vzeroupper + jmp .L1565 + .p2align 4,,10 + .p2align 3 +.L1597: + movq %r10, %rdi + movq %rax, %rcx + jmp .L1558 + .p2align 4,,10 + .p2align 3 +.L1598: + movq %rax, %r10 + movq %rdx, %r9 + movl $4, %ecx + leaq _ZZN3hwy6N_AVX26detail21IndicesFromNotBits256IlLPv0EEENS0_6Vec256IjEEmE11u32_indices(%rip), %rdi + jmp .L1559 +.L1657: + movq -88(%rbp), %rsi + leaq -1(%rsi), %rbx + movq %rbx, %r12 + shrq %r12 + .p2align 4,,10 + .p2align 3 +.L1545: + movq %r12, %rdx + movq %r13, %rdi + call 
_ZN3hwy6N_AVX26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0 + subq $1, %r12 + jnb .L1545 + .p2align 4,,10 + .p2align 3 +.L1546: + movq 0(%r13,%rbx,8), %rdx + movq 0(%r13), %rax + movq %rbx, %rsi + movq %r13, %rdi + movq %rdx, 0(%r13) + xorl %edx, %edx + movq %rax, 0(%r13,%rbx,8) + call _ZN3hwy6N_AVX26detail8SiftDownINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_mm.isra.0 + subq $1, %rbx + jne .L1546 + jmp .L1649 +.L1585: + movl $12, %eax + movl $11, %esi + jmp .L1543 + .p2align 4,,10 + .p2align 3 +.L1544: + cmpq $23, %rax + je .L1652 +.L1543: + movq %rax, %rcx + addq $1, %rax + cmpq (%r12,%rax,8), %rdx + je .L1544 + movl $12, %edi + subq $11, %rcx + movq %rdx, %rax + subq %rsi, %rdi + cmpq %rdi, %rcx + jb .L1542 +.L1652: + movq (%r12,%rsi,8), %rax + jmp .L1542 +.L1586: + movl $9, %esi + movl $10, %eax + jmp .L1543 +.L1587: + movl $8, %esi + movl $9, %eax + jmp .L1543 +.L1588: + movl $7, %esi + movl $8, %eax + jmp .L1543 +.L1589: + movl $6, %esi + movl $7, %eax + jmp .L1543 +.L1665: + movq -88(%rbp), %rsi + vpcmpeqd %ymm1, %ymm1, %ymm1 + .p2align 4,,10 + .p2align 3 +.L1513: + movq %rcx, %rdx + addq $4, %rcx + cmpq %rcx, %rsi + jb .L1669 + vpcmpeqq -32(%r13,%rcx,8), %ymm3, %ymm0 + vpxor %ymm1, %ymm0, %ymm0 + vmovmskpd %ymm0, %eax + testl %eax, %eax + je .L1513 +.L1651: + tzcntl %eax, %eax + addq %rdx, %rax + jmp .L1507 +.L1590: + movl $5, %esi + movl $6, %eax + jmp .L1543 +.L1591: + movl $4, %esi + movl $5, %eax + jmp .L1543 +.L1516: + movq -88(%rbp), %rsi + leaq -80(%rbp), %rdx + movq %r12, %rcx + vmovdqa %ymm3, %ymm0 + vmovdqa %ymm1, -144(%rbp) + subq %rax, %rsi + call _ZN3hwy6N_AVX26detail22MaybePartitionTwoValueINS0_4SimdIlLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEbT_T0_PT1_mDTcl4ZerocvSB__EEESF_RSF_SE_.isra.0 + testb %al, %al + jne .L1649 + vmovdqa -80(%rbp), %ymm3 + vmovdqa -144(%rbp), %ymm1 + jmp .L1520 +.L1592: + movl $3, %esi + movl $4, %eax + jmp .L1543 +.L1593: + movl $2, %esi + movl $3, %eax + jmp .L1543 +.L1594: + movl $1, %esi + movl $2, %eax + jmp .L1543 +.L1656: + xorl %esi, %esi + movl $1, %eax + jmp .L1543 +.L1666: + vmovq %rax, %xmm6 + vpcmpeqq 0(%r13), %ymm3, %ymm5 + movq -88(%rbp), %rdx + vpbroadcastq %xmm6, %ymm2 + vpcmpeqq 0(%r13), %ymm1, %ymm0 + vpcmpgtq %ymm4, %ymm2, %ymm2 + subq %rcx, %rdx + vpor %ymm5, %ymm0, %ymm0 + vpand %ymm2, %ymm5, %ymm6 + vpcmpeqd %ymm5, %ymm5, %ymm5 + vpxor %ymm5, %ymm2, %ymm2 + vpor %ymm2, %ymm0, %ymm0 + vmovmskpd %ymm0, %esi + cmpl $15, %esi + jne .L1670 + vmovmskpd %ymm6, %ecx + movq %rdx, %rax + vmovdqu %ymm3, 0(%r13) + popcntq %rcx, %rcx + subq %rcx, %rax + cmpq $3, %rax + jbe .L1572 + leaq -4(%rax), %rcx + movq -112(%rbp), %rsi + movq %rcx, %rdx + shrq $2, %rdx + salq $5, %rdx + leaq 32(%r13,%rdx), %rdx + .p2align 4,,10 + .p2align 3 +.L1526: + vmovdqu %ymm1, (%rsi) + addq $32, %rsi + cmpq %rsi, %rdx + jne .L1526 + andq $-4, %rcx + leaq 4(%rcx), %rdx + leaq 0(,%rdx,8), %rcx + subq %rdx, %rax +.L1525: + vmovdqa %ymm1, (%r12) + testq %rax, %rax + je .L1648 + leaq 0(%r13,%rcx), %rdi + leaq 0(,%rax,8), %rdx + movq %r12, %rsi + vzeroupper + call memcpy@PLT + jmp .L1649 +.L1668: + movq -88(%rbp), %rdx + jmp .L1536 + .p2align 4,,10 + .p2align 3 +.L1537: + vpcmpgtq -32(%r13,%rsi,8), %ymm0, %ymm1 + vmovmskpd %ymm1, %eax + testl %eax, %eax + jne .L1653 +.L1536: + movq %rsi, %rax + addq $4, %rsi + cmpq %rsi, %rdx + jnb .L1537 + movq -88(%rbp), %rdi + cmpq %rax, %rdi + je .L1583 + vpcmpgtq -32(%r13,%rdi,8), %ymm0, %ymm1 + vmovmskpd 
%ymm1, %eax + cmpl $1, %eax + movl $1, %eax + adcl $0, %eax + movl %eax, -112(%rbp) + jmp .L1538 +.L1540: + movl $11, %eax + movl $10, %esi + jmp .L1543 +.L1669: + movq -88(%rbp), %rax + vpcmpeqd %ymm1, %ymm1, %ymm1 + vpcmpeqq -32(%r13,%rax,8), %ymm3, %ymm0 + leaq -4(%rax), %rdx + vpxor %ymm1, %ymm0, %ymm0 + vmovmskpd %ymm0, %eax + testl %eax, %eax + jne .L1651 +.L1648: + vzeroupper + jmp .L1649 +.L1533: + vmovdqu -32(%r13,%rsi,8), %ymm6 + vpcmpgtq %ymm0, %ymm6, %ymm1 + vmovmskpd %ymm1, %eax + testl %eax, %eax + jne .L1653 +.L1532: + movq %rsi, %rax + addq $4, %rsi + cmpq %rsi, -88(%rbp) + jnb .L1533 + movq -88(%rbp), %rdi + cmpq %rax, %rdi + je .L1534 + vmovdqu -32(%r13,%rdi,8), %ymm6 + vpcmpgtq %ymm0, %ymm6, %ymm1 + vmovdqa %ymm6, -144(%rbp) + vmovmskpd %ymm1, %eax + testl %eax, %eax + jne .L1653 +.L1534: + vpcmpeqd %ymm1, %ymm1, %ymm1 + movl $3, -112(%rbp) + vpaddq %ymm1, %ymm0, %ymm0 + jmp .L1538 +.L1663: + movq %rdx, %rcx + xorl %eax, %eax + cmpq $3, %rdx + jbe .L1494 + movq %rdx, %rbx + leaq -4(%rdx), %rdx + movq (%rdi), %rcx + movq %rdx, %rax + shrq $2, %rax + movq %rcx, (%r12) + addq $1, %rax + salq $5, %rax + movl %eax, %ecx + movq -8(%rdi,%rcx), %rsi + leaq 8(%r12), %rdi + andq $-8, %rdi + movq %rsi, -8(%r12,%rcx) + movq %r12, %rcx + movq %r13, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %eax, %ecx + movq %rdx, %rax + andq $-4, %rax + shrl $3, %ecx + rep movsq + addq $4, %rax + subq %rax, %rbx + movq %rbx, %rcx + je .L1497 +.L1494: + salq $3, %rax + leaq 0(,%rcx,8), %rdx + testq %rcx, %rcx + movl $8, %ecx + cmove %rcx, %rdx + leaq (%r12,%rax), %rdi + leaq 0(%r13,%rax), %rsi + call memcpy@PLT +.L1497: + movq -88(%rbp), %rbx + movl $32, %edx + movl $1, %esi + movl %ebx, %eax + subl $1, %eax + bsrl %eax, %eax + xorl $31, %eax + subl %eax, %edx + movl $1, %eax + shlx %rdx, %rsi, %rsi + shrq $4, %rsi + cmove %rax, %rsi + movq %rsi, %rdx + salq $4, %rdx + addq $4, %rdx + cmpq %rdx, %rbx + jnb .L1496 + vmovdqa .LC13(%rip), %ymm0 + movq %rbx, %rax + .p2align 4,,10 + .p2align 3 +.L1495: + vmovdqu %ymm0, (%r12,%rax,8) + addq $4, %rax + cmpq %rdx, %rax + jb .L1495 + vzeroupper +.L1496: + movq %r12, %rdi + call _ZN3hwy6N_AVX26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + xorl %eax, %eax + cmpq $3, -88(%rbp) + jbe .L1499 + movq -88(%rbp), %rbx + movq (%r12), %rcx + leaq 8(%r13), %rdi + andq $-8, %rdi + leaq -4(%rbx), %rdx + movq %rcx, 0(%r13) + movq %rdx, %rax + shrq $2, %rax + addq $1, %rax + salq $5, %rax + movl %eax, %ecx + movq -8(%r12,%rcx), %rsi + movq %rsi, -8(%r13,%rcx) + movq %r13, %rcx + movq %r12, %rsi + subq %rdi, %rcx + subq %rcx, %rsi + addl %eax, %ecx + movq %rdx, %rax + andq $-4, %rax + shrl $3, %ecx + rep movsq + addq $4, %rax + subq %rax, %rbx + movq %rbx, -88(%rbp) + je .L1649 +.L1499: + movq -88(%rbp), %rbx + salq $3, %rax + movl $8, %ecx + leaq 0(%r13,%rax), %rdi + leaq (%r12,%rax), %rsi + leaq 0(,%rbx,8), %rdx + testq %rbx, %rbx + cmove %rcx, %rdx + call memcpy@PLT + jmp .L1649 +.L1664: + tzcntl %eax, %eax + jmp .L1507 +.L1583: + movl $2, -112(%rbp) + jmp .L1538 +.L1667: + vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 + vpcmpgtq %ymm0, %ymm1, %ymm1 + vmovmskpd %ymm1, %eax + testl %eax, %eax + jne .L1653 + vmovdqa %ymm0, %ymm2 + movl $64, %esi + xorl %ecx, %ecx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +.L1530: + leaq (%rcx,%rax,4), %rdx + addq $1, %rax + vmovdqu 0(%r13,%rdx,8), %ymm1 + vpcmpgtq %ymm2, %ymm1, %ymm3 + vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 + vmovdqa %ymm1, %ymm2 + cmpq $16, %rax + jne .L1530 + 
vpcmpgtq %ymm0, %ymm1, %ymm1 + vmovmskpd %ymm1, %eax + testl %eax, %eax + jne .L1653 + leaq 64(%rsi), %rax + cmpq %rax, -88(%rbp) + jb .L1532 + movq %rsi, %rcx + movq %rax, %rsi + xorl %eax, %eax + jmp .L1530 +.L1572: + xorl %ecx, %ecx + jmp .L1525 +.L1670: + vpxor %ymm5, %ymm0, %ymm0 + vmovmskpd %ymm0, %ecx + tzcntl %ecx, %ecx + vpbroadcastq 0(%r13,%rcx,8), %ymm3 + leaq 4(%rax), %rcx + vmovdqa %ymm3, -80(%rbp) + cmpq %rdx, %rcx + ja .L1523 +.L1524: + vmovdqu %ymm1, -32(%r13,%rcx,8) + movq %rcx, %rax + addq $4, %rcx + cmpq %rdx, %rcx + jbe .L1524 +.L1523: + subq %rax, %rdx + vmovq %rdx, %xmm0 + vpbroadcastq %xmm0, %ymm0 + vpcmpgtq %ymm4, %ymm0, %ymm0 + vpmaskmovq %ymm1, %ymm0, 0(%r13,%rax,8) + jmp .L1520 + .cfi_endproc +.LFE18808: + .size _ZN3hwy6N_AVX26detail7RecurseINS0_4SimdIlLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0, .-_ZN3hwy6N_AVX26detail7RecurseINS0_4SimdIlLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 + .section .text.unlikely._ZN3hwy7N_SSSE310SortI64AscEPlm,"ax",@progbits +.LCOLDB15: + .section .text._ZN3hwy7N_SSSE310SortI64AscEPlm,"ax",@progbits +.LHOTB15: + .p2align 4 + .globl _ZN3hwy7N_SSSE310SortI64AscEPlm + .hidden _ZN3hwy7N_SSSE310SortI64AscEPlm + .type _ZN3hwy7N_SSSE310SortI64AscEPlm, @function +_ZN3hwy7N_SSSE310SortI64AscEPlm: +.LFB2946: + .cfi_startproc + cmpq $1, %rsi + jbe .L1692 + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + pushq %r14 + pushq %r13 + .cfi_offset 15, -24 + .cfi_offset 14, -32 + .cfi_offset 13, -40 + movq %rdi, %r13 + pushq %r12 + .cfi_offset 12, -48 + movq %rsi, %r12 + pushq %rbx + subq $312, %rsp + .cfi_offset 3, -56 + cmpq $32, %rsi + jbe .L1695 + cmpq $0, 16+_ZZN3hwy17GetGeneratorStateEvE5state(%rip) + je .L1696 +.L1682: + leaq -336(%rbp), %rcx + leaq 0(%r13,%r12,8), %rsi + movq %r12, %rdx + movq %r13, %rdi + movl $50, %r9d + leaq _ZZN3hwy17GetGeneratorStateEvE5state(%rip), %r8 + call _ZN3hwy7N_SSSE36detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L1671: + addq $312, %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L1692: + .cfi_restore 3 + .cfi_restore 6 + .cfi_restore 12 + .cfi_restore 13 + .cfi_restore 14 + .cfi_restore 15 + ret + .p2align 4,,10 + .p2align 3 +.L1695: + .cfi_def_cfa 6, 16 + .cfi_offset 3, -56 + .cfi_offset 6, -16 + .cfi_offset 12, -48 + .cfi_offset 13, -40 + .cfi_offset 14, -32 + .cfi_offset 15, -24 + leaq 0(,%rsi,8), %r8 + leaq 256(%rdi), %rax + leaq (%rdi,%r8), %rdx + cmpq %rax, %rdx + jb .L1675 + movl $2, %esi + call _ZN3hwy7N_SSSE36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + jmp .L1671 + .p2align 4,,10 + .p2align 3 +.L1696: + xorl %edx, %edx + movl $16, %esi + leaq _ZZN3hwy17GetGeneratorStateEvE5state(%rip), %rdi + call getrandom@PLT + cmpq $16, %rax + jne .L1690 + movq $1, 16+_ZZN3hwy17GetGeneratorStateEvE5state(%rip) + jmp .L1682 +.L1675: + leaq -2(%rsi), %r14 + movq %r12, %rdx + leaq -336(%rbp), %r15 + movq %r13, %rsi + movq %r14, %rbx + andq $-2, %r14 + movq %r15, %rdi + shrq %rbx + addq $2, %r14 + addq $1, %rbx + salq $4, %rbx + movl %ebx, %ecx + shrl $3, %ecx + subq %r14, %rdx + rep movsq + movq %rdx, -344(%rbp) + je .L1679 + leaq 0(,%r14,8), %rax + salq $3, %rdx + movq %r8, -352(%rbp) + leaq 
(%r15,%rax), %rdi + leaq 0(%r13,%rax), %rsi + call memcpy@PLT + movq -352(%rbp), %r8 +.L1679: + leal -1(%r12), %eax + movl $32, %ecx + bsrl %eax, %eax + xorl $31, %eax + subl %eax, %ecx + movl $1, %eax + salq %cl, %rax + shrq $4, %rax + movq %rax, %rsi + movl $1, %eax + cmove %rax, %rsi + movq %rsi, %rdx + salq $4, %rdx + leaq 2(%rdx), %rax + cmpq %rax, %r12 + jnb .L1680 + movdqa .LC4(%rip), %xmm0 + leaq 2(%r12), %rdx + movups %xmm0, -336(%rbp,%r8) + cmpq %rdx, %rax + jbe .L1680 + movups %xmm0, -320(%rbp,%r8) + leaq 4(%r12), %rdx + cmpq %rdx, %rax + jbe .L1680 + movups %xmm0, -304(%rbp,%r8) + leaq 6(%r12), %rdx + cmpq %rdx, %rax + jbe .L1680 + movups %xmm0, -288(%rbp,%r8) + leaq 8(%r12), %rdx + cmpq %rdx, %rax + jbe .L1680 + movups %xmm0, -272(%rbp,%r8) + leaq 10(%r12), %rdx + cmpq %rdx, %rax + jbe .L1680 + movups %xmm0, -256(%rbp,%r8) + leaq 12(%r12), %rdx + cmpq %rdx, %rax + jbe .L1680 + movups %xmm0, -240(%rbp,%r8) + leaq 14(%r12), %rdx + cmpq %rdx, %rax + jbe .L1680 + movups %xmm0, -224(%rbp,%r8) + leaq 16(%r12), %rdx + cmpq %rdx, %rax + jbe .L1680 + movups %xmm0, -208(%rbp,%r8) + leaq 18(%r12), %rdx + cmpq %rdx, %rax + jbe .L1680 + movups %xmm0, -192(%rbp,%r8) + leaq 20(%r12), %rdx + cmpq %rdx, %rax + jbe .L1680 + movups %xmm0, -176(%rbp,%r8) + leaq 22(%r12), %rdx + cmpq %rdx, %rax + jbe .L1680 + movups %xmm0, -160(%rbp,%r8) + leaq 24(%r12), %rdx + cmpq %rdx, %rax + jbe .L1680 + movups %xmm0, -144(%rbp,%r8) + leaq 26(%r12), %rdx + cmpq %rdx, %rax + jbe .L1680 + leaq 28(%r12), %rdx + movups %xmm0, -128(%rbp,%r8) + cmpq %rdx, %rax + jbe .L1680 + addq $30, %r12 + movups %xmm0, -112(%rbp,%r8) + cmpq %r12, %rax + jbe .L1680 + movups %xmm0, -96(%rbp,%r8) +.L1680: + movq %r15, %rdi + call _ZN3hwy7N_SSSE36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + movq -336(%rbp), %rax + leaq 8(%r13), %rdi + movq %r15, %rsi + andq $-8, %rdi + movq %rax, 0(%r13) + movl %ebx, %eax + movq -8(%r15,%rax), %rdx + movq %rdx, -8(%r13,%rax) + movq %r13, %rax + subq %rdi, %rax + addl %eax, %ebx + subq %rax, %rsi + shrl $3, %ebx + movl %ebx, %ecx + rep movsq + movq -344(%rbp), %rax + testq %rax, %rax + je .L1671 + salq $3, %r14 + salq $3, %rax + leaq 0(%r13,%r14), %rdi + movq %rax, %rdx + leaq (%r15,%r14), %rsi + call memcpy@PLT + jmp .L1671 + .cfi_endproc + .section .text.unlikely._ZN3hwy7N_SSSE310SortI64AscEPlm + .cfi_startproc + .type _ZN3hwy7N_SSSE310SortI64AscEPlm.cold, @function +_ZN3hwy7N_SSSE310SortI64AscEPlm.cold: +.LFSB2946: +.L1690: + .cfi_def_cfa 6, 16 + .cfi_offset 3, -56 + .cfi_offset 6, -16 + .cfi_offset 12, -48 + .cfi_offset 13, -40 + .cfi_offset 14, -32 + .cfi_offset 15, -24 + call abort@PLT + .cfi_endproc +.LFE2946: + .section .text._ZN3hwy7N_SSSE310SortI64AscEPlm + .size _ZN3hwy7N_SSSE310SortI64AscEPlm, .-_ZN3hwy7N_SSSE310SortI64AscEPlm + .section .text.unlikely._ZN3hwy7N_SSSE310SortI64AscEPlm + .size _ZN3hwy7N_SSSE310SortI64AscEPlm.cold, .-_ZN3hwy7N_SSSE310SortI64AscEPlm.cold +.LCOLDE15: + .section .text._ZN3hwy7N_SSSE310SortI64AscEPlm +.LHOTE15: + .section .text.unlikely._ZN3hwy6N_SSE410SortI64AscEPlm,"ax",@progbits +.LCOLDB16: + .section .text._ZN3hwy6N_SSE410SortI64AscEPlm,"ax",@progbits +.LHOTB16: + .p2align 4 + .globl _ZN3hwy6N_SSE410SortI64AscEPlm + .hidden _ZN3hwy6N_SSE410SortI64AscEPlm + .type _ZN3hwy6N_SSE410SortI64AscEPlm, @function +_ZN3hwy6N_SSE410SortI64AscEPlm: +.LFB4014: + .cfi_startproc + cmpq $1, %rsi + jbe .L1718 + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + 
pushq %r15 + pushq %r14 + pushq %r13 + .cfi_offset 15, -24 + .cfi_offset 14, -32 + .cfi_offset 13, -40 + movq %rdi, %r13 + pushq %r12 + .cfi_offset 12, -48 + movq %rsi, %r12 + pushq %rbx + subq $312, %rsp + .cfi_offset 3, -56 + cmpq $32, %rsi + jbe .L1721 + cmpq $0, 16+_ZZN3hwy17GetGeneratorStateEvE5state(%rip) + je .L1722 +.L1708: + leaq -336(%rbp), %rcx + leaq 0(%r13,%r12,8), %rsi + movq %r12, %rdx + movq %r13, %rdi + movl $50, %r9d + leaq _ZZN3hwy17GetGeneratorStateEvE5state(%rip), %r8 + call _ZN3hwy6N_SSE46detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L1697: + addq $312, %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L1718: + .cfi_restore 3 + .cfi_restore 6 + .cfi_restore 12 + .cfi_restore 13 + .cfi_restore 14 + .cfi_restore 15 + ret + .p2align 4,,10 + .p2align 3 +.L1721: + .cfi_def_cfa 6, 16 + .cfi_offset 3, -56 + .cfi_offset 6, -16 + .cfi_offset 12, -48 + .cfi_offset 13, -40 + .cfi_offset 14, -32 + .cfi_offset 15, -24 + leaq 0(,%rsi,8), %r8 + leaq 256(%rdi), %rax + leaq (%rdi,%r8), %rdx + cmpq %rax, %rdx + jb .L1701 + movl $2, %esi + call _ZN3hwy6N_SSE46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + jmp .L1697 + .p2align 4,,10 + .p2align 3 +.L1722: + xorl %edx, %edx + movl $16, %esi + leaq _ZZN3hwy17GetGeneratorStateEvE5state(%rip), %rdi + call getrandom@PLT + cmpq $16, %rax + jne .L1716 + movq $1, 16+_ZZN3hwy17GetGeneratorStateEvE5state(%rip) + jmp .L1708 +.L1701: + leaq -2(%rsi), %r14 + movq %r12, %rdx + leaq -336(%rbp), %r15 + movq %r13, %rsi + movq %r14, %rbx + andq $-2, %r14 + movq %r15, %rdi + shrq %rbx + addq $2, %r14 + addq $1, %rbx + salq $4, %rbx + movl %ebx, %ecx + shrl $3, %ecx + subq %r14, %rdx + rep movsq + movq %rdx, -344(%rbp) + je .L1705 + leaq 0(,%r14,8), %rax + salq $3, %rdx + movq %r8, -352(%rbp) + leaq (%r15,%rax), %rdi + leaq 0(%r13,%rax), %rsi + call memcpy@PLT + movq -352(%rbp), %r8 +.L1705: + leal -1(%r12), %eax + movl $32, %ecx + bsrl %eax, %eax + xorl $31, %eax + subl %eax, %ecx + movl $1, %eax + salq %cl, %rax + shrq $4, %rax + movq %rax, %rsi + movl $1, %eax + cmove %rax, %rsi + movq %rsi, %rdx + salq $4, %rdx + leaq 2(%rdx), %rax + cmpq %rax, %r12 + jnb .L1706 + movdqa .LC4(%rip), %xmm0 + leaq 2(%r12), %rdx + movups %xmm0, -336(%rbp,%r8) + cmpq %rdx, %rax + jbe .L1706 + movups %xmm0, -320(%rbp,%r8) + leaq 4(%r12), %rdx + cmpq %rdx, %rax + jbe .L1706 + movups %xmm0, -304(%rbp,%r8) + leaq 6(%r12), %rdx + cmpq %rdx, %rax + jbe .L1706 + movups %xmm0, -288(%rbp,%r8) + leaq 8(%r12), %rdx + cmpq %rdx, %rax + jbe .L1706 + movups %xmm0, -272(%rbp,%r8) + leaq 10(%r12), %rdx + cmpq %rdx, %rax + jbe .L1706 + movups %xmm0, -256(%rbp,%r8) + leaq 12(%r12), %rdx + cmpq %rdx, %rax + jbe .L1706 + movups %xmm0, -240(%rbp,%r8) + leaq 14(%r12), %rdx + cmpq %rdx, %rax + jbe .L1706 + movups %xmm0, -224(%rbp,%r8) + leaq 16(%r12), %rdx + cmpq %rdx, %rax + jbe .L1706 + movups %xmm0, -208(%rbp,%r8) + leaq 18(%r12), %rdx + cmpq %rdx, %rax + jbe .L1706 + movups %xmm0, -192(%rbp,%r8) + leaq 20(%r12), %rdx + cmpq %rdx, %rax + jbe .L1706 + movups %xmm0, -176(%rbp,%r8) + leaq 22(%r12), %rdx + cmpq %rdx, %rax + jbe .L1706 + movups %xmm0, -160(%rbp,%r8) + leaq 24(%r12), %rdx + cmpq %rdx, %rax + jbe .L1706 + movups %xmm0, -144(%rbp,%r8) + leaq 26(%r12), %rdx + cmpq %rdx, %rax + jbe .L1706 + leaq 28(%r12), %rdx + movups %xmm0, -128(%rbp,%r8) + cmpq %rdx, %rax 
+ jbe .L1706 + addq $30, %r12 + movups %xmm0, -112(%rbp,%r8) + cmpq %r12, %rax + jbe .L1706 + movups %xmm0, -96(%rbp,%r8) +.L1706: + movq %r15, %rdi + call _ZN3hwy6N_SSE46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + movq -336(%rbp), %rax + leaq 8(%r13), %rdi + movq %r15, %rsi + andq $-8, %rdi + movq %rax, 0(%r13) + movl %ebx, %eax + movq -8(%r15,%rax), %rdx + movq %rdx, -8(%r13,%rax) + movq %r13, %rax + subq %rdi, %rax + addl %eax, %ebx + subq %rax, %rsi + shrl $3, %ebx + movl %ebx, %ecx + rep movsq + movq -344(%rbp), %rax + testq %rax, %rax + je .L1697 + salq $3, %r14 + salq $3, %rax + leaq 0(%r13,%r14), %rdi + movq %rax, %rdx + leaq (%r15,%r14), %rsi + call memcpy@PLT + jmp .L1697 + .cfi_endproc + .section .text.unlikely._ZN3hwy6N_SSE410SortI64AscEPlm + .cfi_startproc + .type _ZN3hwy6N_SSE410SortI64AscEPlm.cold, @function +_ZN3hwy6N_SSE410SortI64AscEPlm.cold: +.LFSB4014: +.L1716: + .cfi_def_cfa 6, 16 + .cfi_offset 3, -56 + .cfi_offset 6, -16 + .cfi_offset 12, -48 + .cfi_offset 13, -40 + .cfi_offset 14, -32 + .cfi_offset 15, -24 + call abort@PLT + .cfi_endproc +.LFE4014: + .section .text._ZN3hwy6N_SSE410SortI64AscEPlm + .size _ZN3hwy6N_SSE410SortI64AscEPlm, .-_ZN3hwy6N_SSE410SortI64AscEPlm + .section .text.unlikely._ZN3hwy6N_SSE410SortI64AscEPlm + .size _ZN3hwy6N_SSE410SortI64AscEPlm.cold, .-_ZN3hwy6N_SSE410SortI64AscEPlm.cold +.LCOLDE16: + .section .text._ZN3hwy6N_SSE410SortI64AscEPlm +.LHOTE16: + .section .text.unlikely._ZN3hwy6N_AVX210SortI64AscEPlm,"ax",@progbits +.LCOLDB17: + .section .text._ZN3hwy6N_AVX210SortI64AscEPlm,"ax",@progbits +.LHOTB17: + .p2align 4 + .globl _ZN3hwy6N_AVX210SortI64AscEPlm + .hidden _ZN3hwy6N_AVX210SortI64AscEPlm + .type _ZN3hwy6N_AVX210SortI64AscEPlm, @function +_ZN3hwy6N_AVX210SortI64AscEPlm: +.LFB10439: + .cfi_startproc + cmpq $1, %rsi + jbe .L1751 + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r14 + pushq %r13 + .cfi_offset 14, -24 + .cfi_offset 13, -32 + movq %rdi, %r13 + pushq %r12 + .cfi_offset 12, -40 + movq %rsi, %r12 + andq $-32, %rsp + subq $576, %rsp + cmpq $64, %rsi + jbe .L1754 + cmpq $0, 16+_ZZN3hwy17GetGeneratorStateEvE5state(%rip) + je .L1755 +.L1738: + movq %rsp, %rcx + leaq 0(%r13,%r12,8), %rsi + movq %r12, %rdx + movq %r13, %rdi + movl $50, %r9d + leaq _ZZN3hwy17GetGeneratorStateEvE5state(%rip), %r8 + call _ZN3hwy6N_AVX26detail7RecurseINS0_4SimdIlLm4ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L1749: + leaq -24(%rbp), %rsp + popq %r12 + popq %r13 + popq %r14 + popq %rbp + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L1751: + .cfi_restore 6 + .cfi_restore 12 + .cfi_restore 13 + .cfi_restore 14 + ret + .p2align 4,,10 + .p2align 3 +.L1754: + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + .cfi_offset 12, -40 + .cfi_offset 13, -32 + .cfi_offset 14, -24 + leaq (%rdi,%rsi,8), %rax + leaq 512(%rdi), %rdx + cmpq %rdx, %rax + jb .L1756 + movl $4, %esi + call _ZN3hwy6N_AVX26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + jmp .L1749 + .p2align 4,,10 + .p2align 3 +.L1755: + xorl %edx, %edx + movl $16, %esi + leaq _ZZN3hwy17GetGeneratorStateEvE5state(%rip), %rdi + call getrandom@PLT + cmpq $16, %rax + jne .L1748 + movq $1, 16+_ZZN3hwy17GetGeneratorStateEvE5state(%rip) + jmp .L1738 +.L1756: + movq %rsi, %rcx + xorl %eax, %eax + movq %rsp, %r14 + cmpq $3, %rsi + jbe .L1729 + leaq -4(%rsi), %rax + movq 
%rsp, %r14 + movq %r13, %rsi + movq %rax, %rdx + andq $-4, %rax + movq %r14, %rdi + shrq $2, %rdx + addq $4, %rax + leal 4(,%rdx,4), %ecx + andl $536870908, %ecx + rep movsq + movq %r12, %rcx + subq %rax, %rcx + je .L1733 +.L1729: + salq $3, %rax + leaq 0(,%rcx,8), %rdx + testq %rcx, %rcx + movl $8, %ecx + cmove %rcx, %rdx + leaq (%r14,%rax), %rdi + leaq 0(%r13,%rax), %rsi + call memcpy@PLT +.L1733: + leal -1(%r12), %eax + movl $32, %edx + movl $1, %esi + bsrl %eax, %eax + xorl $31, %eax + subl %eax, %edx + movl $1, %eax + shlx %rdx, %rsi, %rsi + shrq $4, %rsi + cmove %rax, %rsi + movq %rsi, %rdx + salq $4, %rdx + addq $4, %rdx + cmpq %rdx, %r12 + jnb .L1734 + vmovdqa .LC13(%rip), %ymm0 + movq %r12, %rax + .p2align 4,,10 + .p2align 3 +.L1735: + vmovdqu %ymm0, (%r14,%rax,8) + addq $4, %rax + cmpq %rax, %rdx + ja .L1735 + vzeroupper +.L1734: + movq %r14, %rdi + call _ZN3hwy6N_AVX26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + cmpq $3, %r12 + jbe .L1740 + leaq -4(%r12), %rdx + movq (%rsp), %rcx + leaq 8(%r13), %rdi + movq %rdx, %rax + andq $-8, %rdi + andq $-4, %rdx + shrq $2, %rax + movq %rcx, 0(%r13) + addq $1, %rax + salq $5, %rax + movl %eax, %ecx + movq -8(%r14,%rcx), %rsi + movq %rsi, -8(%r13,%rcx) + movq %r13, %rcx + movq %r14, %rsi + subq %rdi, %rcx + addl %ecx, %eax + subq %rcx, %rsi + shrl $3, %eax + movl %eax, %ecx + leaq 4(%rdx), %rax + rep movsq + subq %rax, %r12 + je .L1749 +.L1736: + salq $3, %rax + leaq 0(,%r12,8), %rdx + testq %r12, %r12 + movl $8, %ecx + cmove %rcx, %rdx + leaq 0(%r13,%rax), %rdi + leaq (%r14,%rax), %rsi + call memcpy@PLT + jmp .L1749 +.L1740: + xorl %eax, %eax + jmp .L1736 + .cfi_endproc + .section .text.unlikely._ZN3hwy6N_AVX210SortI64AscEPlm + .cfi_startproc + .type _ZN3hwy6N_AVX210SortI64AscEPlm.cold, @function +_ZN3hwy6N_AVX210SortI64AscEPlm.cold: +.LFSB10439: +.L1748: + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + .cfi_offset 12, -40 + .cfi_offset 13, -32 + .cfi_offset 14, -24 + call abort@PLT + .cfi_endproc +.LFE10439: + .section .text._ZN3hwy6N_AVX210SortI64AscEPlm + .size _ZN3hwy6N_AVX210SortI64AscEPlm, .-_ZN3hwy6N_AVX210SortI64AscEPlm + .section .text.unlikely._ZN3hwy6N_AVX210SortI64AscEPlm + .size _ZN3hwy6N_AVX210SortI64AscEPlm.cold, .-_ZN3hwy6N_AVX210SortI64AscEPlm.cold +.LCOLDE17: + .section .text._ZN3hwy6N_AVX210SortI64AscEPlm +.LHOTE17: + .section .text.unlikely._ZN3hwy6N_AVX310SortI64AscEPlm,"ax",@progbits +.LCOLDB18: + .section .text._ZN3hwy6N_AVX310SortI64AscEPlm,"ax",@progbits +.LHOTB18: + .p2align 4 + .globl _ZN3hwy6N_AVX310SortI64AscEPlm + .hidden _ZN3hwy6N_AVX310SortI64AscEPlm + .type _ZN3hwy6N_AVX310SortI64AscEPlm, @function +_ZN3hwy6N_AVX310SortI64AscEPlm: +.LFB12848: + .cfi_startproc + cmpq $1, %rsi + jbe .L1781 + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r13 + .cfi_offset 13, -24 + movq %rdi, %r13 + pushq %r12 + .cfi_offset 12, -32 + movq %rsi, %r12 + pushq %rbx + andq $-64, %rsp + subq $1152, %rsp + .cfi_offset 3, -40 + cmpq $128, %rsi + jbe .L1784 + cmpq $0, 16+_ZZN3hwy17GetGeneratorStateEvE5state(%rip) + je .L1785 +.L1772: + movq %rsp, %rcx + leaq 0(%r13,%r12,8), %rsi + movq %r12, %rdx + movq %r13, %rdi + movl $50, %r9d + leaq _ZZN3hwy17GetGeneratorStateEvE5state(%rip), %r8 + call _ZN3hwy6N_AVX36detail7RecurseINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L1779: + leaq -24(%rbp), %rsp + popq %rbx + popq %r12 + popq %r13 + popq %rbp + 
.cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L1781: + .cfi_restore 3 + .cfi_restore 6 + .cfi_restore 12 + .cfi_restore 13 + ret + .p2align 4,,10 + .p2align 3 +.L1784: + .cfi_def_cfa 6, 16 + .cfi_offset 3, -40 + .cfi_offset 6, -16 + .cfi_offset 12, -32 + .cfi_offset 13, -24 + leaq (%rdi,%rsi,8), %rax + leaq 1024(%rdi), %rdx + cmpq %rdx, %rax + jb .L1786 + movl $8, %esi + call _ZN3hwy6N_AVX36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + jmp .L1779 + .p2align 4,,10 + .p2align 3 +.L1785: + xorl %edx, %edx + movl $16, %esi + leaq _ZZN3hwy17GetGeneratorStateEvE5state(%rip), %rdi + call getrandom@PLT + cmpq $16, %rax + jne .L1778 + movq $1, 16+_ZZN3hwy17GetGeneratorStateEvE5state(%rip) + jmp .L1772 +.L1786: + cmpq $7, %rsi + jbe .L1787 + leaq -8(%rsi), %rax + movq %rsp, %rbx + movq %r13, %rsi + movq %rax, %rdx + andq $-8, %rax + movq %rbx, %rdi + shrq $3, %rdx + addq $8, %rax + leal 8(,%rdx,8), %ecx + leaq 0(,%rax,8), %rdx + andl $536870904, %ecx + rep movsq + movq %r12, %rcx + leaq (%rbx,%rdx), %rdi + addq %r13, %rdx + subq %rax, %rcx + movl $255, %eax + kmovd %eax, %k1 + cmpq $255, %rcx + jbe .L1762 +.L1766: + leal -1(%r12), %eax + movl $32, %ecx + movl $1, %esi + vmovdqu64 (%rdx), %zmm0{%k1}{z} + bsrl %eax, %eax + xorl $31, %eax + vmovdqa64 %zmm0, (%rdi){%k1} + vpbroadcastq .LC10(%rip), %zmm0 + subl %eax, %ecx + movl $1, %eax + shlx %rcx, %rsi, %rsi + shrq $4, %rsi + cmove %rax, %rsi + movq %r12, %rax + movq %rsi, %rdx + salq $4, %rdx + addq $8, %rdx + cmpq %rdx, %r12 + jnb .L1770 + .p2align 4,,10 + .p2align 3 +.L1767: + vmovdqu64 %zmm0, (%rbx,%rax,8) + addq $8, %rax + cmpq %rdx, %rax + jb .L1767 +.L1770: + movq %rbx, %rdi + vzeroupper + call _ZN3hwy6N_AVX36detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + cmpq $7, %r12 + jbe .L1769 + leaq -8(%r12), %rdx + movq (%rsp), %rcx + leaq 8(%r13), %rdi + movq %rdx, %rax + andq $-8, %rdi + andq $-8, %rdx + shrq $3, %rax + movq %rcx, 0(%r13) + addq $1, %rax + salq $6, %rax + movl %eax, %ecx + movq -8(%rbx,%rcx), %rsi + movq %rsi, -8(%r13,%rcx) + movq %r13, %rcx + movq %rbx, %rsi + subq %rdi, %rcx + addl %ecx, %eax + subq %rcx, %rsi + shrl $3, %eax + movl %eax, %ecx + leaq 8(%rdx), %rax + rep movsq + leaq 0(,%rax,8), %rdx + subq %rax, %r12 + movl $255, %eax + addq %rdx, %r13 + addq %rdx, %rbx + kmovd %eax, %k1 + cmpq $255, %r12 + jbe .L1769 +.L1771: + vmovdqa64 (%rbx), %zmm0{%k1}{z} + vmovdqu64 %zmm0, 0(%r13){%k1} + vzeroupper + jmp .L1779 +.L1787: + movq %rsp, %rbx + movq %rdi, %rdx + movq %rsi, %rcx + movq %rbx, %rdi +.L1762: + movq $-1, %rax + bzhi %rcx, %rax, %rax + movzbl %al, %eax + kmovd %eax, %k1 + jmp .L1766 +.L1769: + movq $-1, %rax + bzhi %r12, %rax, %rax + movzbl %al, %eax + kmovd %eax, %k1 + jmp .L1771 + .cfi_endproc + .section .text.unlikely._ZN3hwy6N_AVX310SortI64AscEPlm + .cfi_startproc + .type _ZN3hwy6N_AVX310SortI64AscEPlm.cold, @function +_ZN3hwy6N_AVX310SortI64AscEPlm.cold: +.LFSB12848: +.L1778: + .cfi_def_cfa 6, 16 + .cfi_offset 3, -40 + .cfi_offset 6, -16 + .cfi_offset 12, -32 + .cfi_offset 13, -24 + call abort@PLT + .cfi_endproc +.LFE12848: + .section .text._ZN3hwy6N_AVX310SortI64AscEPlm + .size _ZN3hwy6N_AVX310SortI64AscEPlm, .-_ZN3hwy6N_AVX310SortI64AscEPlm + .section .text.unlikely._ZN3hwy6N_AVX310SortI64AscEPlm + .size _ZN3hwy6N_AVX310SortI64AscEPlm.cold, .-_ZN3hwy6N_AVX310SortI64AscEPlm.cold +.LCOLDE18: + .section .text._ZN3hwy6N_AVX310SortI64AscEPlm +.LHOTE18: + .section 
.text.unlikely._ZN3hwy11N_AVX3_ZEN410SortI64AscEPlm,"ax",@progbits +.LCOLDB19: + .section .text._ZN3hwy11N_AVX3_ZEN410SortI64AscEPlm,"ax",@progbits +.LHOTB19: + .p2align 4 + .globl _ZN3hwy11N_AVX3_ZEN410SortI64AscEPlm + .hidden _ZN3hwy11N_AVX3_ZEN410SortI64AscEPlm + .type _ZN3hwy11N_AVX3_ZEN410SortI64AscEPlm, @function +_ZN3hwy11N_AVX3_ZEN410SortI64AscEPlm: +.LFB15264: + .cfi_startproc + cmpq $1, %rsi + jbe .L1812 + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r13 + .cfi_offset 13, -24 + movq %rdi, %r13 + pushq %r12 + .cfi_offset 12, -32 + movq %rsi, %r12 + pushq %rbx + andq $-64, %rsp + subq $1152, %rsp + .cfi_offset 3, -40 + cmpq $128, %rsi + jbe .L1815 + cmpq $0, 16+_ZZN3hwy17GetGeneratorStateEvE5state(%rip) + je .L1816 +.L1803: + movq %rsp, %rcx + leaq 0(%r13,%r12,8), %rsi + movq %r12, %rdx + movq %r13, %rdi + movl $50, %r9d + leaq _ZZN3hwy17GetGeneratorStateEvE5state(%rip), %r8 + call _ZN3hwy11N_AVX3_ZEN46detail7RecurseINS0_4SimdIlLm8ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L1810: + leaq -24(%rbp), %rsp + popq %rbx + popq %r12 + popq %r13 + popq %rbp + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L1812: + .cfi_restore 3 + .cfi_restore 6 + .cfi_restore 12 + .cfi_restore 13 + ret + .p2align 4,,10 + .p2align 3 +.L1815: + .cfi_def_cfa 6, 16 + .cfi_offset 3, -40 + .cfi_offset 6, -16 + .cfi_offset 12, -32 + .cfi_offset 13, -24 + leaq (%rdi,%rsi,8), %rax + leaq 1024(%rdi), %rdx + cmpq %rdx, %rax + jb .L1817 + movl $8, %esi + call _ZN3hwy11N_AVX3_ZEN46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + jmp .L1810 + .p2align 4,,10 + .p2align 3 +.L1816: + xorl %edx, %edx + movl $16, %esi + leaq _ZZN3hwy17GetGeneratorStateEvE5state(%rip), %rdi + call getrandom@PLT + cmpq $16, %rax + jne .L1809 + movq $1, 16+_ZZN3hwy17GetGeneratorStateEvE5state(%rip) + jmp .L1803 +.L1817: + cmpq $7, %rsi + jbe .L1818 + leaq -8(%rsi), %rax + movq %rsp, %rbx + movq %r13, %rsi + movq %rax, %rdx + andq $-8, %rax + movq %rbx, %rdi + shrq $3, %rdx + addq $8, %rax + leal 8(,%rdx,8), %ecx + leaq 0(,%rax,8), %rdx + andl $536870904, %ecx + rep movsq + movq %r12, %rcx + leaq (%rbx,%rdx), %rdi + addq %r13, %rdx + subq %rax, %rcx + movl $255, %eax + kmovd %eax, %k1 + cmpq $255, %rcx + jbe .L1793 +.L1797: + leal -1(%r12), %eax + movl $32, %ecx + movl $1, %esi + vmovdqu64 (%rdx), %zmm0{%k1}{z} + bsrl %eax, %eax + xorl $31, %eax + vmovdqa64 %zmm0, (%rdi){%k1} + vpbroadcastq .LC10(%rip), %zmm0 + subl %eax, %ecx + movl $1, %eax + shlx %rcx, %rsi, %rsi + shrq $4, %rsi + cmove %rax, %rsi + movq %r12, %rax + movq %rsi, %rdx + salq $4, %rdx + addq $8, %rdx + cmpq %rdx, %r12 + jnb .L1801 + .p2align 4,,10 + .p2align 3 +.L1798: + vmovdqu64 %zmm0, (%rbx,%rax,8) + addq $8, %rax + cmpq %rdx, %rax + jb .L1798 +.L1801: + movq %rbx, %rdi + vzeroupper + call _ZN3hwy11N_AVX3_ZEN46detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + cmpq $7, %r12 + jbe .L1800 + leaq -8(%r12), %rdx + movq (%rsp), %rcx + leaq 8(%r13), %rdi + movq %rdx, %rax + andq $-8, %rdi + andq $-8, %rdx + shrq $3, %rax + movq %rcx, 0(%r13) + addq $1, %rax + salq $6, %rax + movl %eax, %ecx + movq -8(%rbx,%rcx), %rsi + movq %rsi, -8(%r13,%rcx) + movq %r13, %rcx + movq %rbx, %rsi + subq %rdi, %rcx + addl %ecx, %eax + subq %rcx, %rsi + shrl $3, %eax + movl %eax, %ecx + leaq 8(%rdx), %rax + rep movsq + leaq 0(,%rax,8), %rdx + subq %rax, 
%r12 + movl $255, %eax + addq %rdx, %r13 + addq %rdx, %rbx + kmovd %eax, %k1 + cmpq $255, %r12 + jbe .L1800 +.L1802: + vmovdqa64 (%rbx), %zmm0{%k1}{z} + vmovdqu64 %zmm0, 0(%r13){%k1} + vzeroupper + jmp .L1810 +.L1818: + movq %rsp, %rbx + movq %rdi, %rdx + movq %rsi, %rcx + movq %rbx, %rdi +.L1793: + movq $-1, %rax + bzhi %rcx, %rax, %rax + movzbl %al, %eax + kmovd %eax, %k1 + jmp .L1797 +.L1800: + movq $-1, %rax + bzhi %r12, %rax, %rax + movzbl %al, %eax + kmovd %eax, %k1 + jmp .L1802 + .cfi_endproc + .section .text.unlikely._ZN3hwy11N_AVX3_ZEN410SortI64AscEPlm + .cfi_startproc + .type _ZN3hwy11N_AVX3_ZEN410SortI64AscEPlm.cold, @function +_ZN3hwy11N_AVX3_ZEN410SortI64AscEPlm.cold: +.LFSB15264: +.L1809: + .cfi_def_cfa 6, 16 + .cfi_offset 3, -40 + .cfi_offset 6, -16 + .cfi_offset 12, -32 + .cfi_offset 13, -24 + call abort@PLT + .cfi_endproc +.LFE15264: + .section .text._ZN3hwy11N_AVX3_ZEN410SortI64AscEPlm + .size _ZN3hwy11N_AVX3_ZEN410SortI64AscEPlm, .-_ZN3hwy11N_AVX3_ZEN410SortI64AscEPlm + .section .text.unlikely._ZN3hwy11N_AVX3_ZEN410SortI64AscEPlm + .size _ZN3hwy11N_AVX3_ZEN410SortI64AscEPlm.cold, .-_ZN3hwy11N_AVX3_ZEN410SortI64AscEPlm.cold +.LCOLDE19: + .section .text._ZN3hwy11N_AVX3_ZEN410SortI64AscEPlm +.LHOTE19: + .section .text.unlikely._ZN3hwy6N_SSE210SortI64AscEPlm,"ax",@progbits +.LCOLDB20: + .section .text._ZN3hwy6N_SSE210SortI64AscEPlm,"ax",@progbits +.LHOTB20: + .p2align 4 + .globl _ZN3hwy6N_SSE210SortI64AscEPlm + .hidden _ZN3hwy6N_SSE210SortI64AscEPlm + .type _ZN3hwy6N_SSE210SortI64AscEPlm, @function +_ZN3hwy6N_SSE210SortI64AscEPlm: +.LFB16254: + .cfi_startproc + cmpq $1, %rsi + jbe .L1840 + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r15 + pushq %r14 + pushq %r13 + .cfi_offset 15, -24 + .cfi_offset 14, -32 + .cfi_offset 13, -40 + movq %rdi, %r13 + pushq %r12 + .cfi_offset 12, -48 + movq %rsi, %r12 + pushq %rbx + subq $312, %rsp + .cfi_offset 3, -56 + cmpq $32, %rsi + jbe .L1843 + cmpq $0, 16+_ZZN3hwy17GetGeneratorStateEvE5state(%rip) + je .L1844 +.L1830: + leaq -336(%rbp), %rcx + leaq 0(%r13,%r12,8), %rsi + movq %r12, %rdx + movq %r13, %rdi + movl $50, %r9d + leaq _ZZN3hwy17GetGeneratorStateEvE5state(%rip), %r8 + call _ZN3hwy6N_SSE26detail7RecurseINS0_4SimdIlLm2ELi0EEENS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_T0_PT1_SE_mSE_Pmm.isra.0 +.L1819: + addq $312, %rsp + popq %rbx + popq %r12 + popq %r13 + popq %r14 + popq %r15 + popq %rbp + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L1840: + .cfi_restore 3 + .cfi_restore 6 + .cfi_restore 12 + .cfi_restore 13 + .cfi_restore 14 + .cfi_restore 15 + ret + .p2align 4,,10 + .p2align 3 +.L1843: + .cfi_def_cfa 6, 16 + .cfi_offset 3, -56 + .cfi_offset 6, -16 + .cfi_offset 12, -48 + .cfi_offset 13, -40 + .cfi_offset 14, -32 + .cfi_offset 15, -24 + leaq 0(,%rsi,8), %r8 + leaq 256(%rdi), %rax + leaq (%rdi,%r8), %rdx + cmpq %rax, %rdx + jb .L1823 + movl $2, %esi + call _ZN3hwy6N_SSE26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + jmp .L1819 + .p2align 4,,10 + .p2align 3 +.L1844: + xorl %edx, %edx + movl $16, %esi + leaq _ZZN3hwy17GetGeneratorStateEvE5state(%rip), %rdi + call getrandom@PLT + cmpq $16, %rax + jne .L1838 + movq $1, 16+_ZZN3hwy17GetGeneratorStateEvE5state(%rip) + jmp .L1830 +.L1823: + leaq -2(%rsi), %r14 + movq %r12, %rdx + leaq -336(%rbp), %r15 + movq %r13, %rsi + movq %r14, %rbx + andq $-2, %r14 + movq %r15, %rdi + shrq %rbx + addq $2, %r14 + addq $1, %rbx + salq $4, 
%rbx + movl %ebx, %ecx + shrl $3, %ecx + subq %r14, %rdx + rep movsq + movq %rdx, -344(%rbp) + je .L1827 + leaq 0(,%r14,8), %rax + salq $3, %rdx + movq %r8, -352(%rbp) + leaq (%r15,%rax), %rdi + leaq 0(%r13,%rax), %rsi + call memcpy@PLT + movq -352(%rbp), %r8 +.L1827: + leal -1(%r12), %eax + movl $32, %ecx + bsrl %eax, %eax + xorl $31, %eax + subl %eax, %ecx + movl $1, %eax + salq %cl, %rax + shrq $4, %rax + movq %rax, %rsi + movl $1, %eax + cmove %rax, %rsi + movq %rsi, %rdx + salq $4, %rdx + leaq 2(%rdx), %rax + cmpq %rax, %r12 + jnb .L1828 + movdqa .LC4(%rip), %xmm0 + leaq 2(%r12), %rdx + movups %xmm0, -336(%rbp,%r8) + cmpq %rdx, %rax + jbe .L1828 + movups %xmm0, -320(%rbp,%r8) + leaq 4(%r12), %rdx + cmpq %rdx, %rax + jbe .L1828 + movups %xmm0, -304(%rbp,%r8) + leaq 6(%r12), %rdx + cmpq %rdx, %rax + jbe .L1828 + movups %xmm0, -288(%rbp,%r8) + leaq 8(%r12), %rdx + cmpq %rdx, %rax + jbe .L1828 + movups %xmm0, -272(%rbp,%r8) + leaq 10(%r12), %rdx + cmpq %rdx, %rax + jbe .L1828 + movups %xmm0, -256(%rbp,%r8) + leaq 12(%r12), %rdx + cmpq %rdx, %rax + jbe .L1828 + movups %xmm0, -240(%rbp,%r8) + leaq 14(%r12), %rdx + cmpq %rdx, %rax + jbe .L1828 + movups %xmm0, -224(%rbp,%r8) + leaq 16(%r12), %rdx + cmpq %rdx, %rax + jbe .L1828 + movups %xmm0, -208(%rbp,%r8) + leaq 18(%r12), %rdx + cmpq %rdx, %rax + jbe .L1828 + movups %xmm0, -192(%rbp,%r8) + leaq 20(%r12), %rdx + cmpq %rdx, %rax + jbe .L1828 + movups %xmm0, -176(%rbp,%r8) + leaq 22(%r12), %rdx + cmpq %rdx, %rax + jbe .L1828 + movups %xmm0, -160(%rbp,%r8) + leaq 24(%r12), %rdx + cmpq %rdx, %rax + jbe .L1828 + movups %xmm0, -144(%rbp,%r8) + leaq 26(%r12), %rdx + cmpq %rdx, %rax + jbe .L1828 + leaq 28(%r12), %rdx + movups %xmm0, -128(%rbp,%r8) + cmpq %rdx, %rax + jbe .L1828 + addq $30, %r12 + movups %xmm0, -112(%rbp,%r8) + cmpq %r12, %rax + jbe .L1828 + movups %xmm0, -96(%rbp,%r8) +.L1828: + movq %r15, %rdi + call _ZN3hwy6N_SSE26detail14SortingNetworkINS1_12SharedTraitsINS1_10TraitsLaneINS1_14OrderAscendingIlEEEEEElEEvT_PT0_m.isra.0 + movq -336(%rbp), %rax + leaq 8(%r13), %rdi + movq %r15, %rsi + andq $-8, %rdi + movq %rax, 0(%r13) + movl %ebx, %eax + movq -8(%r15,%rax), %rdx + movq %rdx, -8(%r13,%rax) + movq %r13, %rax + subq %rdi, %rax + addl %eax, %ebx + subq %rax, %rsi + shrl $3, %ebx + movl %ebx, %ecx + rep movsq + movq -344(%rbp), %rax + testq %rax, %rax + je .L1819 + salq $3, %r14 + salq $3, %rax + leaq 0(%r13,%r14), %rdi + movq %rax, %rdx + leaq (%r15,%r14), %rsi + call memcpy@PLT + jmp .L1819 + .cfi_endproc + .section .text.unlikely._ZN3hwy6N_SSE210SortI64AscEPlm + .cfi_startproc + .type _ZN3hwy6N_SSE210SortI64AscEPlm.cold, @function +_ZN3hwy6N_SSE210SortI64AscEPlm.cold: +.LFSB16254: +.L1838: + .cfi_def_cfa 6, 16 + .cfi_offset 3, -56 + .cfi_offset 6, -16 + .cfi_offset 12, -48 + .cfi_offset 13, -40 + .cfi_offset 14, -32 + .cfi_offset 15, -24 + call abort@PLT + .cfi_endproc +.LFE16254: + .section .text._ZN3hwy6N_SSE210SortI64AscEPlm + .size _ZN3hwy6N_SSE210SortI64AscEPlm, .-_ZN3hwy6N_SSE210SortI64AscEPlm + .section .text.unlikely._ZN3hwy6N_SSE210SortI64AscEPlm + .size _ZN3hwy6N_SSE210SortI64AscEPlm.cold, .-_ZN3hwy6N_SSE210SortI64AscEPlm.cold +.LCOLDE20: + .section .text._ZN3hwy6N_SSE210SortI64AscEPlm +.LHOTE20: + .section .text.unlikely._ZN3hwy17GetGeneratorStateEv,"ax",@progbits +.LCOLDB21: + .section .text._ZN3hwy17GetGeneratorStateEv,"ax",@progbits +.LHOTB21: + .p2align 4 + .globl _ZN3hwy17GetGeneratorStateEv + .hidden _ZN3hwy17GetGeneratorStateEv + .type _ZN3hwy17GetGeneratorStateEv, @function +_ZN3hwy17GetGeneratorStateEv: 
+.LFB16255: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + pushq %r12 + .cfi_offset 12, -24 + leaq _ZZN3hwy17GetGeneratorStateEvE5state(%rip), %r12 + subq $8, %rsp + cmpq $0, 16+_ZZN3hwy17GetGeneratorStateEvE5state(%rip) + je .L1850 + movq %r12, %rax + movq -8(%rbp), %r12 + leave + .cfi_remember_state + .cfi_def_cfa 7, 8 + ret + .p2align 4,,10 + .p2align 3 +.L1850: + .cfi_restore_state + xorl %edx, %edx + movl $16, %esi + movq %r12, %rdi + call getrandom@PLT + cmpq $16, %rax + jne .L1848 + movq $1, 16+_ZZN3hwy17GetGeneratorStateEvE5state(%rip) + movq %r12, %rax + movq -8(%rbp), %r12 + leave + .cfi_def_cfa 7, 8 + ret + .cfi_endproc + .section .text.unlikely._ZN3hwy17GetGeneratorStateEv + .cfi_startproc + .type _ZN3hwy17GetGeneratorStateEv.cold, @function +_ZN3hwy17GetGeneratorStateEv.cold: +.LFSB16255: +.L1848: + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + .cfi_offset 12, -24 + call abort@PLT + .cfi_endproc +.LFE16255: + .section .text._ZN3hwy17GetGeneratorStateEv + .size _ZN3hwy17GetGeneratorStateEv, .-_ZN3hwy17GetGeneratorStateEv + .section .text.unlikely._ZN3hwy17GetGeneratorStateEv + .size _ZN3hwy17GetGeneratorStateEv.cold, .-_ZN3hwy17GetGeneratorStateEv.cold +.LCOLDE21: + .section .text._ZN3hwy17GetGeneratorStateEv +.LHOTE21: + .section .text.vqsort_int64_avx2,"ax",@progbits + .p2align 4 + .globl vqsort_int64_avx2 + .hidden vqsort_int64_avx2 + .type vqsort_int64_avx2, @function +vqsort_int64_avx2: +.LFB16256: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + call _ZN3hwy6N_AVX210SortI64AscEPlm + popq %rbp + .cfi_def_cfa 7, 8 + ret + .cfi_endproc +.LFE16256: + .size vqsort_int64_avx2, .-vqsort_int64_avx2 + .section .text.vqsort_int64_sse4,"ax",@progbits + .p2align 4 + .globl vqsort_int64_sse4 + .hidden vqsort_int64_sse4 + .type vqsort_int64_sse4, @function +vqsort_int64_sse4: +.LFB16257: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + call _ZN3hwy6N_SSE410SortI64AscEPlm + popq %rbp + .cfi_def_cfa 7, 8 + ret + .cfi_endproc +.LFE16257: + .size vqsort_int64_sse4, .-vqsort_int64_sse4 + .section .text.vqsort_int64_ssse3,"ax",@progbits + .p2align 4 + .globl vqsort_int64_ssse3 + .hidden vqsort_int64_ssse3 + .type vqsort_int64_ssse3, @function +vqsort_int64_ssse3: +.LFB16258: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + call _ZN3hwy7N_SSSE310SortI64AscEPlm + popq %rbp + .cfi_def_cfa 7, 8 + ret + .cfi_endproc +.LFE16258: + .size vqsort_int64_ssse3, .-vqsort_int64_ssse3 + .section .text.vqsort_int64_sse2,"ax",@progbits + .p2align 4 + .globl vqsort_int64_sse2 + .hidden vqsort_int64_sse2 + .type vqsort_int64_sse2, @function +vqsort_int64_sse2: +.LFB16259: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + call _ZN3hwy6N_SSE210SortI64AscEPlm + popq %rbp + .cfi_def_cfa 7, 8 + ret + .cfi_endproc +.LFE16259: + .size vqsort_int64_sse2, .-vqsort_int64_sse2 + .hidden _ZZN3hwy6N_AVX26detail21IndicesFromNotBits256IlLPv0EEENS0_6Vec256IjEEmE11u32_indices + .weak _ZZN3hwy6N_AVX26detail21IndicesFromNotBits256IlLPv0EEENS0_6Vec256IjEEmE11u32_indices + .section 
.rodata._ZZN3hwy6N_AVX26detail21IndicesFromNotBits256IlLPv0EEENS0_6Vec256IjEEmE11u32_indices,"aG",@progbits,_ZZN3hwy6N_AVX26detail21IndicesFromNotBits256IlLPv0EEENS0_6Vec256IjEEmE11u32_indices,comdat + .align 32 + .type _ZZN3hwy6N_AVX26detail21IndicesFromNotBits256IlLPv0EEENS0_6Vec256IjEEmE11u32_indices, @object + .size _ZZN3hwy6N_AVX26detail21IndicesFromNotBits256IlLPv0EEENS0_6Vec256IjEEmE11u32_indices, 512 +_ZZN3hwy6N_AVX26detail21IndicesFromNotBits256IlLPv0EEENS0_6Vec256IjEEmE11u32_indices: + .long 8 + .long 9 + .long 10 + .long 11 + .long 12 + .long 13 + .long 14 + .long 15 + .long 10 + .long 11 + .long 12 + .long 13 + .long 14 + .long 15 + .long 8 + .long 9 + .long 8 + .long 9 + .long 12 + .long 13 + .long 14 + .long 15 + .long 10 + .long 11 + .long 12 + .long 13 + .long 14 + .long 15 + .long 8 + .long 9 + .long 10 + .long 11 + .long 8 + .long 9 + .long 10 + .long 11 + .long 14 + .long 15 + .long 12 + .long 13 + .long 10 + .long 11 + .long 14 + .long 15 + .long 8 + .long 9 + .long 12 + .long 13 + .long 8 + .long 9 + .long 14 + .long 15 + .long 10 + .long 11 + .long 12 + .long 13 + .long 14 + .long 15 + .long 8 + .long 9 + .long 10 + .long 11 + .long 12 + .long 13 + .long 8 + .long 9 + .long 10 + .long 11 + .long 12 + .long 13 + .long 14 + .long 15 + .long 10 + .long 11 + .long 12 + .long 13 + .long 8 + .long 9 + .long 14 + .long 15 + .long 8 + .long 9 + .long 12 + .long 13 + .long 10 + .long 11 + .long 14 + .long 15 + .long 12 + .long 13 + .long 8 + .long 9 + .long 10 + .long 11 + .long 14 + .long 15 + .long 8 + .long 9 + .long 10 + .long 11 + .long 12 + .long 13 + .long 14 + .long 15 + .long 10 + .long 11 + .long 8 + .long 9 + .long 12 + .long 13 + .long 14 + .long 15 + .long 8 + .long 9 + .long 10 + .long 11 + .long 12 + .long 13 + .long 14 + .long 15 + .long 8 + .long 9 + .long 10 + .long 11 + .long 12 + .long 13 + .long 14 + .long 15 + .section .rodata._ZZN3hwy6N_AVX3L11CompressNotIlLPv0EEENS0_6Vec512IT_EES5_NS0_7Mask512IS4_EEE12packed_array,"a" + .align 16 + .type _ZZN3hwy6N_AVX3L11CompressNotIlLPv0EEENS0_6Vec512IT_EES5_NS0_7Mask512IS4_EEE12packed_array, @object + .size _ZZN3hwy6N_AVX3L11CompressNotIlLPv0EEENS0_6Vec512IT_EES5_NS0_7Mask512IS4_EEE12packed_array, 2048 +_ZZN3hwy6N_AVX3L11CompressNotIlLPv0EEENS0_6Vec512IT_EES5_NS0_7Mask512IS4_EEE12packed_array: + .quad 1985229328 + .quad 124076833 + .quad 392512288 + .quad 276190258 + .quad 660947728 + .quad 544625713 + .quad 561402928 + .quad 554132803 + .quad 929382928 + .quad 813061153 + .quad 829838368 + .quad 822568258 + .quad 846615568 + .quad 839345473 + .quad 840394048 + .quad 839939668 + .quad 1197814288 + .quad 1081496353 + .quad 1098273568 + .quad 1091003698 + .quad 1115050768 + .quad 1107780913 + .quad 1108829488 + .quad 1108375123 + .quad 1131827728 + .quad 1124558113 + .quad 1125606688 + .quad 1125152338 + .quad 1126655248 + .quad 1126200913 + .quad 1126266448 + .quad 1126238053 + .quad 1466184208 + .quad 1349927713 + .quad 1366704928 + .quad 1359438898 + .quad 1383482128 + .quad 1376216113 + .quad 1377264688 + .quad 1376810563 + .quad 1400259088 + .quad 1392993313 + .quad 1394041888 + .quad 1393587778 + .quad 1395090448 + .quad 1394636353 + .quad 1394701888 + .quad 1394673508 + .quad 1417032208 + .quad 1409770273 + .quad 1410818848 + .quad 1410364978 + .quad 1411867408 + .quad 1411413553 + .quad 1411479088 + .quad 1411450723 + .quad 1412915728 + .quad 1412462113 + .quad 1412527648 + .quad 1412499298 + .quad 1412593168 + .quad 1412564833 + .quad 1412568928 + .quad 1412567158 + .quad 1733571088 + .quad 1618297633 + .quad 
1635074848 + .quad 1627870258 + .quad 1651852048 + .quad 1644647473 + .quad 1645696048 + .quad 1645245763 + .quad 1668629008 + .quad 1661424673 + .quad 1662473248 + .quad 1662022978 + .quad 1663521808 + .quad 1663071553 + .quad 1663137088 + .quad 1663108948 + .quad 1685402128 + .quad 1678201633 + .quad 1679250208 + .quad 1678800178 + .quad 1680298768 + .quad 1679848753 + .quad 1679914288 + .quad 1679886163 + .quad 1681347088 + .quad 1680897313 + .quad 1680962848 + .quad 1680934738 + .quad 1681028368 + .quad 1681000273 + .quad 1681004368 + .quad 1681002613 + .quad 1702113808 + .quad 1694974753 + .quad 1696023328 + .quad 1695577138 + .quad 1697071888 + .quad 1696625713 + .quad 1696691248 + .quad 1696663363 + .quad 1698120208 + .quad 1697674273 + .quad 1697739808 + .quad 1697711938 + .quad 1697805328 + .quad 1697777473 + .quad 1697781568 + .quad 1697779828 + .quad 1699164688 + .quad 1698722593 + .quad 1698788128 + .quad 1698760498 + .quad 1698853648 + .quad 1698826033 + .quad 1698830128 + .quad 1698828403 + .quad 1698918928 + .quad 1698891553 + .quad 1698895648 + .quad 1698893938 + .quad 1698899728 + .quad 1698898033 + .quad 1698898288 + .quad 1698898183 + .quad 1985229328 + .quad 1885684513 + .quad 1902461728 + .quad 1896240178 + .quad 1919238928 + .quad 1913017393 + .quad 1914065968 + .quad 1913677123 + .quad 1936015888 + .quad 1929794593 + .quad 1930843168 + .quad 1930454338 + .quad 1931891728 + .quad 1931502913 + .quad 1931568448 + .quad 1931544148 + .quad 1952789008 + .quad 1946571553 + .quad 1947620128 + .quad 1947231538 + .quad 1948668688 + .quad 1948280113 + .quad 1948345648 + .quad 1948321363 + .quad 1949717008 + .quad 1949328673 + .quad 1949394208 + .quad 1949369938 + .quad 1949459728 + .quad 1949435473 + .quad 1949439568 + .quad 1949438053 + .quad 1969500688 + .quad 1963344673 + .quad 1964393248 + .quad 1964008498 + .quad 1965441808 + .quad 1965057073 + .quad 1965122608 + .quad 1965098563 + .quad 1966490128 + .quad 1966105633 + .quad 1966171168 + .quad 1966147138 + .quad 1966236688 + .quad 1966212673 + .quad 1966216768 + .quad 1966215268 + .quad 1967534608 + .quad 1967153953 + .quad 1967219488 + .quad 1967195698 + .quad 1967285008 + .quad 1967261233 + .quad 1967265328 + .quad 1967263843 + .quad 1967350288 + .quad 1967326753 + .quad 1967330848 + .quad 1967329378 + .quad 1967334928 + .quad 1967333473 + .quad 1967333728 + .quad 1967333638 + .quad 1985229328 + .quad 1980056353 + .quad 1981104928 + .quad 1980781618 + .quad 1982153488 + .quad 1981830193 + .quad 1981895728 + .quad 1981875523 + .quad 1983201808 + .quad 1982878753 + .quad 1982944288 + .quad 1982924098 + .quad 1983009808 + .quad 1982989633 + .quad 1982993728 + .quad 1982992468 + .quad 1984246288 + .quad 1983927073 + .quad 1983992608 + .quad 1983972658 + .quad 1984058128 + .quad 1984038193 + .quad 1984042288 + .quad 1984041043 + .quad 1984123408 + .quad 1984103713 + .quad 1984107808 + .quad 1984106578 + .quad 1984111888 + .quad 1984110673 + .quad 1984110928 + .quad 1984110853 + .quad 1985229328 + .quad 1984971553 + .quad 1985037088 + .quad 1985020978 + .quad 1985102608 + .quad 1985086513 + .quad 1985090608 + .quad 1985089603 + .quad 1985167888 + .quad 1985152033 + .quad 1985156128 + .quad 1985155138 + .quad 1985160208 + .quad 1985159233 + .quad 1985159488 + .quad 1985159428 + .quad 1985229328 + .quad 1985217313 + .quad 1985221408 + .quad 1985220658 + .quad 1985225488 + .quad 1985224753 + .quad 1985225008 + .quad 1985224963 + .quad 1985229328 + .quad 1985228833 + .quad 1985229088 + .quad 1985229058 + .quad 1985229328 + .quad 
1985229313 + .quad 1985229328 + .quad 1985229328 + .set _ZZN3hwy11N_AVX3_ZEN4L11CompressNotIlLPv0EEENS0_6Vec512IT_EES5_NS0_7Mask512IS4_EEE12packed_array,_ZZN3hwy6N_AVX3L11CompressNotIlLPv0EEENS0_6Vec512IT_EES5_NS0_7Mask512IS4_EEE12packed_array + .hidden _ZZN3hwy6N_SSE26detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices + .weak _ZZN3hwy6N_SSE26detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices + .section .rodata._ZZN3hwy6N_SSE26detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices,"aG",@progbits,_ZZN3hwy6N_SSE26detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices,comdat + .align 16 + .type _ZZN3hwy6N_SSE26detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices, @object + .size _ZZN3hwy6N_SSE26detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices, 64 +_ZZN3hwy6N_SSE26detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices: + .string "" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007" + .ascii "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017" + .section .rodata._ZZN3hwy11N_AVX3_ZEN4L8CompressIlLPv0EEENS0_6Vec512IT_EES5_NS0_7Mask512IS4_EEE12packed_array,"a" + .align 16 + .type _ZZN3hwy11N_AVX3_ZEN4L8CompressIlLPv0EEENS0_6Vec512IT_EES5_NS0_7Mask512IS4_EEE12packed_array, @object + .size _ZZN3hwy11N_AVX3_ZEN4L8CompressIlLPv0EEENS0_6Vec512IT_EES5_NS0_7Mask512IS4_EEE12packed_array, 2048 +_ZZN3hwy11N_AVX3_ZEN4L8CompressIlLPv0EEENS0_6Vec512IT_EES5_NS0_7Mask512IS4_EEE12packed_array: + .quad 1985229328 + .quad 1985229328 + .quad 1985229313 + .quad 1985229328 + .quad 1985229058 + .quad 1985229088 + .quad 1985228833 + .quad 1985229328 + .quad 1985224963 + .quad 1985225008 + .quad 1985224753 + .quad 1985225488 + .quad 1985220658 + .quad 1985221408 + .quad 1985217313 + .quad 1985229328 + .quad 1985159428 + .quad 1985159488 + .quad 1985159233 + .quad 1985160208 + .quad 1985155138 + .quad 1985156128 + .quad 1985152033 + .quad 1985167888 + .quad 1985089603 + .quad 1985090608 + .quad 1985086513 + .quad 1985102608 + .quad 1985020978 + .quad 1985037088 + .quad 1984971553 + .quad 1985229328 + .quad 1984110853 + .quad 1984110928 + .quad 1984110673 + .quad 1984111888 + .quad 1984106578 + .quad 1984107808 + .quad 1984103713 + .quad 1984123408 + .quad 1984041043 + .quad 1984042288 + .quad 1984038193 + .quad 1984058128 + .quad 1983972658 + .quad 1983992608 + .quad 1983927073 + .quad 1984246288 + .quad 1982992468 + .quad 1982993728 + .quad 1982989633 + .quad 1983009808 + .quad 1982924098 + .quad 1982944288 + .quad 1982878753 + .quad 1983201808 + .quad 1981875523 + .quad 1981895728 + .quad 1981830193 + .quad 1982153488 + .quad 1980781618 + .quad 1981104928 + .quad 1980056353 + .quad 1985229328 + .quad 1967333638 + .quad 1967333728 + .quad 1967333473 + .quad 1967334928 + .quad 1967329378 + .quad 1967330848 + .quad 1967326753 + .quad 1967350288 + .quad 1967263843 + .quad 1967265328 + .quad 1967261233 + .quad 1967285008 + .quad 1967195698 + .quad 1967219488 + .quad 1967153953 + .quad 1967534608 + .quad 1966215268 + .quad 1966216768 + .quad 1966212673 + .quad 1966236688 + .quad 1966147138 + .quad 1966171168 + .quad 1966105633 + .quad 1966490128 + .quad 1965098563 + .quad 1965122608 + .quad 1965057073 + .quad 1965441808 + .quad 
1964008498 + .quad 1964393248 + .quad 1963344673 + .quad 1969500688 + .quad 1949438053 + .quad 1949439568 + .quad 1949435473 + .quad 1949459728 + .quad 1949369938 + .quad 1949394208 + .quad 1949328673 + .quad 1949717008 + .quad 1948321363 + .quad 1948345648 + .quad 1948280113 + .quad 1948668688 + .quad 1947231538 + .quad 1947620128 + .quad 1946571553 + .quad 1952789008 + .quad 1931544148 + .quad 1931568448 + .quad 1931502913 + .quad 1931891728 + .quad 1930454338 + .quad 1930843168 + .quad 1929794593 + .quad 1936015888 + .quad 1913677123 + .quad 1914065968 + .quad 1913017393 + .quad 1919238928 + .quad 1896240178 + .quad 1902461728 + .quad 1885684513 + .quad 1985229328 + .quad 1698898183 + .quad 1698898288 + .quad 1698898033 + .quad 1698899728 + .quad 1698893938 + .quad 1698895648 + .quad 1698891553 + .quad 1698918928 + .quad 1698828403 + .quad 1698830128 + .quad 1698826033 + .quad 1698853648 + .quad 1698760498 + .quad 1698788128 + .quad 1698722593 + .quad 1699164688 + .quad 1697779828 + .quad 1697781568 + .quad 1697777473 + .quad 1697805328 + .quad 1697711938 + .quad 1697739808 + .quad 1697674273 + .quad 1698120208 + .quad 1696663363 + .quad 1696691248 + .quad 1696625713 + .quad 1697071888 + .quad 1695577138 + .quad 1696023328 + .quad 1694974753 + .quad 1702113808 + .quad 1681002613 + .quad 1681004368 + .quad 1681000273 + .quad 1681028368 + .quad 1680934738 + .quad 1680962848 + .quad 1680897313 + .quad 1681347088 + .quad 1679886163 + .quad 1679914288 + .quad 1679848753 + .quad 1680298768 + .quad 1678800178 + .quad 1679250208 + .quad 1678201633 + .quad 1685402128 + .quad 1663108948 + .quad 1663137088 + .quad 1663071553 + .quad 1663521808 + .quad 1662022978 + .quad 1662473248 + .quad 1661424673 + .quad 1668629008 + .quad 1645245763 + .quad 1645696048 + .quad 1644647473 + .quad 1651852048 + .quad 1627870258 + .quad 1635074848 + .quad 1618297633 + .quad 1733571088 + .quad 1412567158 + .quad 1412568928 + .quad 1412564833 + .quad 1412593168 + .quad 1412499298 + .quad 1412527648 + .quad 1412462113 + .quad 1412915728 + .quad 1411450723 + .quad 1411479088 + .quad 1411413553 + .quad 1411867408 + .quad 1410364978 + .quad 1410818848 + .quad 1409770273 + .quad 1417032208 + .quad 1394673508 + .quad 1394701888 + .quad 1394636353 + .quad 1395090448 + .quad 1393587778 + .quad 1394041888 + .quad 1392993313 + .quad 1400259088 + .quad 1376810563 + .quad 1377264688 + .quad 1376216113 + .quad 1383482128 + .quad 1359438898 + .quad 1366704928 + .quad 1349927713 + .quad 1466184208 + .quad 1126238053 + .quad 1126266448 + .quad 1126200913 + .quad 1126655248 + .quad 1125152338 + .quad 1125606688 + .quad 1124558113 + .quad 1131827728 + .quad 1108375123 + .quad 1108829488 + .quad 1107780913 + .quad 1115050768 + .quad 1091003698 + .quad 1098273568 + .quad 1081496353 + .quad 1197814288 + .quad 839939668 + .quad 840394048 + .quad 839345473 + .quad 846615568 + .quad 822568258 + .quad 829838368 + .quad 813061153 + .quad 929382928 + .quad 554132803 + .quad 561402928 + .quad 544625713 + .quad 660947728 + .quad 276190258 + .quad 392512288 + .quad 124076833 + .quad 1985229328 + .hidden _ZZN3hwy6N_AVX26detail18IndicesFromBits256IlLPv0EEENS0_6Vec256IjEEmE11u32_indices + .weak _ZZN3hwy6N_AVX26detail18IndicesFromBits256IlLPv0EEENS0_6Vec256IjEEmE11u32_indices + .section .rodata._ZZN3hwy6N_AVX26detail18IndicesFromBits256IlLPv0EEENS0_6Vec256IjEEmE11u32_indices,"aG",@progbits,_ZZN3hwy6N_AVX26detail18IndicesFromBits256IlLPv0EEENS0_6Vec256IjEEmE11u32_indices,comdat + .align 32 + .type 
_ZZN3hwy6N_AVX26detail18IndicesFromBits256IlLPv0EEENS0_6Vec256IjEEmE11u32_indices, @object + .size _ZZN3hwy6N_AVX26detail18IndicesFromBits256IlLPv0EEENS0_6Vec256IjEEmE11u32_indices, 512 +_ZZN3hwy6N_AVX26detail18IndicesFromBits256IlLPv0EEENS0_6Vec256IjEEmE11u32_indices: + .long 0 + .long 1 + .long 2 + .long 3 + .long 4 + .long 5 + .long 6 + .long 7 + .long 8 + .long 9 + .long 2 + .long 3 + .long 4 + .long 5 + .long 6 + .long 7 + .long 10 + .long 11 + .long 0 + .long 1 + .long 4 + .long 5 + .long 6 + .long 7 + .long 8 + .long 9 + .long 10 + .long 11 + .long 4 + .long 5 + .long 6 + .long 7 + .long 12 + .long 13 + .long 0 + .long 1 + .long 2 + .long 3 + .long 6 + .long 7 + .long 8 + .long 9 + .long 12 + .long 13 + .long 2 + .long 3 + .long 6 + .long 7 + .long 10 + .long 11 + .long 12 + .long 13 + .long 0 + .long 1 + .long 6 + .long 7 + .long 8 + .long 9 + .long 10 + .long 11 + .long 12 + .long 13 + .long 6 + .long 7 + .long 14 + .long 15 + .long 0 + .long 1 + .long 2 + .long 3 + .long 4 + .long 5 + .long 8 + .long 9 + .long 14 + .long 15 + .long 2 + .long 3 + .long 4 + .long 5 + .long 10 + .long 11 + .long 14 + .long 15 + .long 0 + .long 1 + .long 4 + .long 5 + .long 8 + .long 9 + .long 10 + .long 11 + .long 14 + .long 15 + .long 4 + .long 5 + .long 12 + .long 13 + .long 14 + .long 15 + .long 0 + .long 1 + .long 2 + .long 3 + .long 8 + .long 9 + .long 12 + .long 13 + .long 14 + .long 15 + .long 2 + .long 3 + .long 10 + .long 11 + .long 12 + .long 13 + .long 14 + .long 15 + .long 0 + .long 1 + .long 8 + .long 9 + .long 10 + .long 11 + .long 12 + .long 13 + .long 14 + .long 15 + .hidden _ZZN3hwy6N_SSE46detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices + .weak _ZZN3hwy6N_SSE46detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices + .section .rodata._ZZN3hwy6N_SSE46detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices,"aG",@progbits,_ZZN3hwy6N_SSE46detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices,comdat + .align 16 + .type _ZZN3hwy6N_SSE46detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices, @object + .size _ZZN3hwy6N_SSE46detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices, 64 +_ZZN3hwy6N_SSE46detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices: + .string "" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007" + .ascii "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017" + .hidden _ZZN3hwy7N_SSSE36detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices + .weak _ZZN3hwy7N_SSSE36detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices + .section .rodata._ZZN3hwy7N_SSSE36detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices,"aG",@progbits,_ZZN3hwy7N_SSSE36detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices,comdat + .align 16 + .type _ZZN3hwy7N_SSSE36detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices, @object + .size _ZZN3hwy7N_SSSE36detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices, 64 
+_ZZN3hwy7N_SSSE36detail18IndicesFromBits128INS0_4SimdIlLm2ELi0EEELPv0EEEDTcl4ZerocvT__EEES6_mE10u8_indices: + .string "" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\b\t\n\013\f\r\016\017" + .string "\001\002\003\004\005\006\007" + .ascii "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017" + .section .bss._ZZN3hwy17GetGeneratorStateEvE5state,"aw",@nobits + .align 16 + .type _ZZN3hwy17GetGeneratorStateEvE5state, @object + .size _ZZN3hwy17GetGeneratorStateEvE5state, 24 +_ZZN3hwy17GetGeneratorStateEvE5state: + .zero 24 + .set .LC0,.LC3 + .set .LC1,.LC3 + .section .rodata + .align 64 +.LC2: + .quad 7 + .quad 6 + .quad 5 + .quad 4 + .quad 3 + .quad 2 + .quad 1 + .quad 0 + .section .rodata.cst32,"aM",@progbits,32 + .align 32 +.LC3: + .quad 0 + .quad 1 + .quad 2 + .quad 3 + .set .LC4,.LC7 + .set .LC5,.LC8 + .set .LC6,.LC7 + .section .rodata + .align 64 +.LC7: + .quad 9223372036854775807 + .quad 9223372036854775807 + .quad 9223372036854775807 + .quad 9223372036854775807 + .quad 9223372036854775807 + .quad 9223372036854775807 + .quad 9223372036854775807 + .quad 9223372036854775807 + .align 64 +.LC8: + .quad -9223372036854775808 + .quad -9223372036854775808 + .quad -9223372036854775808 + .quad -9223372036854775808 + .quad -9223372036854775808 + .quad -9223372036854775808 + .quad -9223372036854775808 + .quad -9223372036854775808 + .align 64 +.LC9: + .quad 0 + .quad 4 + .quad 8 + .quad 12 + .quad 16 + .quad 20 + .quad 24 + .quad 28 + .set .LC10,.LC7 + .set .LC12,.LC8 + .set .LC13,.LC7 + .set .LC14,.LC8 diff --git a/third_party/vqsort/vqsort_int32.c b/third_party/vqsort/vqsort_int32.c new file mode 100644 index 000000000..3d5fdea24 --- /dev/null +++ b/third_party/vqsort/vqsort_int32.c @@ -0,0 +1,29 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2023 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. 
│ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/mem/alg.h" +#include "libc/nexgen32e/x86feature.h" +#include "third_party/vqsort/vqsort.h" + +void vqsort_int32(int32_t *A, size_t n) { + if (X86_HAVE(AVX2)) { + vqsort_int32_avx2(A, n); + } else { + radix_sort_int32(A, n); + } +} diff --git a/third_party/vqsort/vqsort_int64.c b/third_party/vqsort/vqsort_int64.c new file mode 100644 index 000000000..57498dd46 --- /dev/null +++ b/third_party/vqsort/vqsort_int64.c @@ -0,0 +1,29 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2023 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/mem/alg.h" +#include "libc/nexgen32e/x86feature.h" +#include "third_party/vqsort/vqsort.h" + +void vqsort_int64(int64_t *A, size_t n) { + if (X86_HAVE(AVX2)) { + vqsort_int64_avx2(A, n); + } else { + radix_sort_int64(A, n); + } +}
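The two wrappers above pick the AVX2 vectorized kernel when the CPU reports that feature and otherwise fall back to the portable radix sorts, so callers get one stable entry point per element type. What follows is a small, hypothetical sanity check, not part of this change, that exercises the new vqsort_int32 entry point against qsort. It is written as plain hosted C with standard headers for brevity; only vqsort_int32 and its header path come from this diff, and everything else (the array size, the comparator name cmp32) is illustrative.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "third_party/vqsort/vqsort.h"

// comparator for the qsort reference run
static int cmp32(const void *a, const void *b) {
  int32_t x = *(const int32_t *)a;
  int32_t y = *(const int32_t *)b;
  return (x > y) - (x < y);
}

int main(void) {
  enum { N = 1 << 16 };
  int32_t *a = malloc(N * sizeof *a);
  int32_t *b = malloc(N * sizeof *b);
  if (!a || !b) return 1;
  for (int i = 0; i < N; ++i) {
    a[i] = rand() - RAND_MAX / 2;  // mix of negative and positive values
  }
  memcpy(b, a, N * sizeof *a);
  vqsort_int32(a, N);             // AVX2 kernel or radix-sort fallback
  qsort(b, N, sizeof *b, cmp32);  // reference ordering
  puts(memcmp(a, b, N * sizeof *a) ? "MISMATCH" : "OK");
  free(a);
  free(b);
  return 0;
}

The int64 entry point can be checked the same way. As for the per-call branch in the wrappers, X86_HAVE expands to a test of a cached CPU-feature bit rather than issuing CPUID on every call, so its cost is negligible next to the sort itself.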